From 604326b41a6fb9b4a78b6179335decee0365cd8c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sat, 13 Oct 2018 02:45:58 +0200
Subject: bpf, sockmap: convert to generic sk_msg interface

Add a generic sk_msg layer, and convert the current sockmap and later kTLS over to make use of it. While sk_buff handles network packet representation from the netdevice up to the socket, sk_msg handles data representation from the application to the socket layer. This means the sk_msg framework spans ULP users in the kernel, and enables features such as introspection or filtering of data with the help of BPF programs that operate on this data structure.

The latter becomes particularly useful for kTLS, where data encryption is deferred into the kernel; it enables the kernel to perform L7 introspection and BPF-based policy for TLS connections, where the record is encrypted after BPF has run and come to a verdict. In order to get there, the first step is to transform the open-coded scatter-gather list handling into a common core framework that subsystems can use.

The code itself has been split and refactored into three bigger pieces: i) the generic sk_msg API, which deals with managing the scatter-gather ring, provides helpers for walking and mangling it, transfers application data from user space into it, and prepares it for BPF pre/post-processing; ii) the plain sock map itself, where sockets can be attached to or detached from; these bits are independent of i), which can now also be used without sock map; and iii) the integration with plain TCP as one protocol to be used for processing L7 application data (later this could e.g. also be extended to other protocols like UDP).

The semantics are the same as with the old sock map code, so there is no change in user-facing behavior or APIs. Pursuing this work also helped uncover a number of bugs in the old sockmap code that we have already fixed in earlier commits. The test_sockmap kselftest suite passes fine as well.

Joint work with John.

Signed-off-by: Daniel Borkmann
Signed-off-by: John Fastabend
Signed-off-by: Alexei Starovoitov
---
include/linux/skmsg.h | 371 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 371 insertions(+) create mode 100644 include/linux/skmsg.h

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h new file mode 100644 index 000000000000..95678103c4a0 --- /dev/null +++ b/include/linux/skmsg.h @@ -0,0 +1,371 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2017 - 2018 Covalent IO, Inc.
http://covalent.io */ + +#ifndef _LINUX_SKMSG_H +#define _LINUX_SKMSG_H + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/scatterlist.h> +#include <linux/skbuff.h> + +#include <net/sock.h> +#include <net/tcp.h> +#include <net/strparser.h> + +#define MAX_MSG_FRAGS MAX_SKB_FRAGS + +enum __sk_action { + __SK_DROP = 0, + __SK_PASS, + __SK_REDIRECT, + __SK_NONE, +}; + +struct sk_msg_sg { + u32 start; + u32 curr; + u32 end; + u32 size; + u32 copybreak; + bool copy[MAX_MSG_FRAGS]; + struct scatterlist data[MAX_MSG_FRAGS]; +}; + +struct sk_msg { + struct sk_msg_sg sg; + void *data; + void *data_end; + u32 apply_bytes; + u32 cork_bytes; + u32 flags; + struct sk_buff *skb; + struct sock *sk_redir; + struct sock *sk; + struct list_head list; +}; + +struct sk_psock_progs { + struct bpf_prog *msg_parser; + struct bpf_prog *skb_parser; + struct bpf_prog *skb_verdict; +}; + +enum sk_psock_state_bits { + SK_PSOCK_TX_ENABLED, +}; + +struct sk_psock_link { + struct list_head list; + struct bpf_map *map; + void *link_raw; +}; + +struct sk_psock_parser { + struct strparser strp; + bool enabled; + void (*saved_data_ready)(struct sock *sk); +}; + +struct sk_psock_work_state { + struct sk_buff *skb; + u32 len; + u32 off; +}; + +struct sk_psock { + struct sock *sk; + struct sock *sk_redir; + u32 apply_bytes; + u32 cork_bytes; + u32 eval; + struct sk_msg *cork; + struct sk_psock_progs progs; + struct sk_psock_parser parser; + struct sk_buff_head ingress_skb; + struct list_head ingress_msg; + unsigned long state; + struct list_head link; + spinlock_t link_lock; + refcount_t refcnt; + void (*saved_unhash)(struct sock *sk); + void (*saved_close)(struct sock *sk, long timeout); + void (*saved_write_space)(struct sock *sk); + struct proto *sk_proto; + struct sk_psock_work_state work_state; + struct work_struct work; + union { + struct rcu_head rcu; + struct work_struct gc; + }; +}; + +int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, + int elem_first_coalesce); +void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); +int sk_msg_free(struct sock *sk, struct sk_msg *msg); +int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); +void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes); +void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, + u32 bytes); + +void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); + +int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes); +int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, + struct sk_msg *msg, u32 bytes); + +static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) +{ + WARN_ON(i == msg->sg.end && bytes); +} + +static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < bytes) + psock->apply_bytes = 0; + else + psock->apply_bytes -= bytes; + } +} + +#define sk_msg_iter_var_prev(var) \ + do { \ + if (var == 0) \ + var = MAX_MSG_FRAGS - 1; \ + else \ + var--; \ + } while (0) + +#define sk_msg_iter_var_next(var) \ + do { \ + var++; \ + if (var == MAX_MSG_FRAGS) \ + var = 0; \ + } while (0) + +#define sk_msg_iter_prev(msg, which) \ + sk_msg_iter_var_prev(msg->sg.which) + +#define sk_msg_iter_next(msg, which) \ + sk_msg_iter_var_next(msg->sg.which) + +static inline void sk_msg_clear_meta(struct sk_msg *msg) +{ + memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); +} + +static inline void sk_msg_init(struct sk_msg *msg) +{ + memset(msg, 0, sizeof(*msg)); + sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data)); +} + +static inline void 
sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, + int which, u32 size) +{ + dst->sg.data[which] = src->sg.data[which]; + dst->sg.data[which].length = size; + src->sg.data[which].length -= size; + src->sg.data[which].offset += size; +} + +static inline u32 sk_msg_elem_used(const struct sk_msg *msg) +{ + return msg->sg.end >= msg->sg.start ? + msg->sg.end - msg->sg.start : + msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); +} + +static inline bool sk_msg_full(const struct sk_msg *msg) +{ + return (msg->sg.end == msg->sg.start) && msg->sg.size; +} + +static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) +{ + return &msg->sg.data[which]; +} + +static inline struct page *sk_msg_page(struct sk_msg *msg, int which) +{ + return sg_page(sk_msg_elem(msg, which)); +} + +static inline bool sk_msg_to_ingress(const struct sk_msg *msg) +{ + return msg->flags & BPF_F_INGRESS; +} + +static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) +{ + struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); + + if (msg->sg.copy[msg->sg.start]) { + msg->data = NULL; + msg->data_end = NULL; + } else { + msg->data = sg_virt(sge); + msg->data_end = msg->data + sge->length; + } +} + +static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, + u32 len, u32 offset) +{ + struct scatterlist *sge; + + get_page(page); + sge = sk_msg_elem(msg, msg->sg.end); + sg_set_page(sge, page, len, offset); + sg_unmark_end(sge); + + msg->sg.copy[msg->sg.end] = true; + msg->sg.size += len; + sk_msg_iter_next(msg, end); +} + +static inline struct sk_psock *sk_psock(const struct sock *sk) +{ + return rcu_dereference_sk_user_data(sk); +} + +static inline bool sk_has_psock(struct sock *sk) +{ + return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg; +} + +static inline void sk_psock_queue_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + list_add_tail(&msg->list, &psock->ingress_msg); +} + +static inline void sk_psock_report_error(struct sk_psock *psock, int err) +{ + struct sock *sk = psock->sk; + + sk->sk_err = err; + sk->sk_error_report(sk); +} + +struct sk_psock *sk_psock_init(struct sock *sk, int node); + +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); + +int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, + struct sk_msg *msg); + +static inline struct sk_psock_link *sk_psock_init_link(void) +{ + return kzalloc(sizeof(struct sk_psock_link), + GFP_ATOMIC | __GFP_NOWARN); +} + +static inline void sk_psock_free_link(struct sk_psock_link *link) +{ + kfree(link); +} + +struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); +#if defined(CONFIG_BPF_STREAM_PARSER) +void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link); +#else +static inline void sk_psock_unlink(struct sock *sk, + struct sk_psock_link *link) +{ +} +#endif + +void __sk_psock_purge_ingress_msg(struct sk_psock *psock); + +static inline void sk_psock_cork_free(struct sk_psock *psock) +{ + if (psock->cork) { + sk_msg_free(psock->sk, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } +} + +static inline void sk_psock_update_proto(struct sock *sk, + struct sk_psock *psock, + struct proto *ops) +{ + psock->saved_unhash = sk->sk_prot->unhash; + psock->saved_close = sk->sk_prot->close; + psock->saved_write_space = sk->sk_write_space; + + psock->sk_proto = sk->sk_prot; + sk->sk_prot = ops; +} + +static inline void 
sk_psock_restore_proto(struct sock *sk, + struct sk_psock *psock) +{ + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } +} + +static inline void sk_psock_set_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + set_bit(bit, &psock->state); +} + +static inline void sk_psock_clear_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + clear_bit(bit, &psock->state); +} + +static inline bool sk_psock_test_state(const struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + return test_bit(bit, &psock->state); +} + +static inline struct sk_psock *sk_psock_get(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock && !refcount_inc_not_zero(&psock->refcnt)) + psock = NULL; + rcu_read_unlock(); + return psock; +} + +void sk_psock_stop(struct sock *sk, struct sk_psock *psock); +void sk_psock_destroy(struct rcu_head *rcu); +void sk_psock_drop(struct sock *sk, struct sk_psock *psock); + +static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) +{ + if (refcount_dec_and_test(&psock->refcnt)) + sk_psock_drop(sk, psock); +} + +static inline void psock_set_prog(struct bpf_prog **pprog, + struct bpf_prog *prog) +{ + prog = xchg(pprog, prog); + if (prog) + bpf_prog_put(prog); +} + +static inline void psock_progs_drop(struct sk_psock_progs *progs) +{ + psock_set_prog(&progs->msg_parser, NULL); + psock_set_prog(&progs->skb_parser, NULL); + psock_set_prog(&progs->skb_verdict, NULL); +} + +#endif /* _LINUX_SKMSG_H */
--
From d829e9c4112b52f4f00195900fd4c685f61365ab Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sat, 13 Oct 2018 02:45:59 +0200
Subject: tls: convert to generic sk_msg interface

Convert kTLS over to make use of the sk_msg interface for plaintext and encrypted scatter-gather data, so it reuses all the sk_msg helpers and data structures, which in a second step enables gluing this to BPF. This also allows removing quite a few open-coded helpers that are covered by the sk_msg API. Recent changes in kTLS, 80ece6a03aaf ("tls: Remove redundant vars from tls record structure") and 4e6d47206c32 ("tls: Add support for inplace records encryption"), changed the data path handling a bit; while we have kept the latter optimization intact, we had to undo the former change to better fit the sk_msg model, hence sg_aead_in and sg_aead_out have been brought back and are linked into the sk_msg sgs. The kTLS record now contains a msg_plaintext and a msg_encrypted sk_msg each.

In the original code, zerocopy_from_iter() was used in both the TX and RX paths. For the strparser skb-based RX path, we have left zerocopy_from_iter() in decrypt_internal() mostly untouched, meaning it has been moved into tls_setup_from_iter() with the charging logic removed (as it is not used from RX). Given the RX path is not based on sk_msg objects, we have not pursued setting up a dummy sk_msg to call into sk_msg_zerocopy_from_iter(), but it could be an option to pursue in a later step.

Joint work with John.
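For readers coming to this conversion fresh: the sk_msg helpers that kTLS now reuses treat msg->sg.data[] as a ring indexed by sg.start and sg.end, with wrap-around handled by the iterator macros from the previous patch. Below is a minimal user-space sketch of that arithmetic; it is a toy model, not part of this patch, and the MAX_MSG_FRAGS value of 17 is an assumption matching MAX_SKB_FRAGS on a typical 4K-page build.

/* Toy model of the sk_msg ring index arithmetic (user space). */
#include <stdio.h>

#define MAX_MSG_FRAGS 17 /* assumed MAX_SKB_FRAGS */

struct toy_msg { unsigned int start, end, size; };

/* Mirrors sk_msg_iter_var_next(): advance and wrap. */
static void toy_iter_next(unsigned int *var)
{
	(*var)++;
	if (*var == MAX_MSG_FRAGS)
		*var = 0;
}

/* Mirrors sk_msg_elem_used(): element count across the wrap. */
static unsigned int toy_elem_used(const struct toy_msg *m)
{
	return m->end >= m->start ? m->end - m->start :
	       m->end + (MAX_MSG_FRAGS - m->start);
}

int main(void)
{
	struct toy_msg m = { .start = 15, .end = 15, .size = 0 };
	int i;

	/* Queue four frags; 'end' wraps past the array boundary. */
	for (i = 0; i < 4; i++) {
		toy_iter_next(&m.end);
		m.size += 4096;
	}
	/* Prints: start=15 end=2 used=4 */
	printf("start=%u end=%u used=%u\n",
	       m.start, m.end, toy_elem_used(&m));
	return 0;
}

Note that "full" cannot be distinguished from "empty" by the indices alone, which is why sk_msg_full() above also checks sg.size.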
Signed-off-by: Daniel Borkmann
Signed-off-by: John Fastabend
Signed-off-by: Alexei Starovoitov
---
include/linux/skmsg.h | 2 + include/net/sock.h | 4 - include/net/tls.h | 18 +- net/core/skmsg.c | 39 ++++ net/core/sock.c | 61 ------ net/tls/Kconfig | 1 + net/tls/tls_device.c | 2 +- net/tls/tls_sw.c | 511 ++++++++++++++++++-------------------------------- 8 files changed, 236 insertions(+), 402 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 95678103c4a0..4e84b3c2eff8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -102,6 +102,8 @@ struct sk_psock { int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, int elem_first_coalesce); +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len); void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len); int sk_msg_free(struct sock *sk, struct sk_msg *msg); int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg); diff --git a/include/net/sock.h b/include/net/sock.h index 751549ac0a84..7470c45d182d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2214,10 +2214,6 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr, unsigned int *sg_size, - int first_coalesce); - /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/include/net/tls.h b/include/net/tls.h index 5e853835597e..3d22d8a59be7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -39,6 +39,8 @@ #include <linux/crypto.h> #include <linux/socket.h> #include <linux/tcp.h> +#include <linux/skmsg.h> + #include <net/tcp.h> #include <net/strparser.h> #include <crypto/aead.h> @@ -103,15 +105,13 @@ struct tls_rec { int tx_flags; int inplace_crypto; - /* AAD | sg_plaintext_data | sg_tag */ - struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS + 1]; - /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ - struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS + 1]; + struct sk_msg msg_plaintext; + struct sk_msg msg_encrypted; - unsigned int sg_plaintext_size; - unsigned int sg_encrypted_size; - int sg_plaintext_num_elem; - int sg_encrypted_num_elem; + /* AAD | msg_plaintext.sg.data | sg_tag */ + struct scatterlist sg_aead_in[2]; + /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ + struct scatterlist sg_aead_out[2]; char aad_space[TLS_AAD_SPACE_SIZE]; struct aead_request aead_req; @@ -223,8 +223,8 @@ struct tls_context { unsigned long flags; bool in_tcp_sendpages; + bool pending_open_record_frags; - u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index ae2b281c9c57..56a99d0c9aa0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -73,6 +73,45 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, } EXPORT_SYMBOL_GPL(sk_msg_alloc); +int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src, + u32 off, u32 len) +{ + int i = src->sg.start; + struct scatterlist *sge = sk_msg_elem(src, i); + u32 sge_len, sge_off; + + if (sk_msg_full(dst)) + return -ENOSPC; + + while (off) { + if (sge->length > off) + break; + off -= sge->length; + sk_msg_iter_var_next(i); + if (i == src->sg.end && off) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + while (len) { + sge_len = sge->length - off; + sge_off = sge->offset + off; + if (sge_len > len) + sge_len = len; + off = 0; +
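/* Skipping is required only for the first element copied: 'off' was + * cleared above, so later elements are copied from their start. */ +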
len -= sge_len; + sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); + sk_mem_charge(sk, sge_len); + sk_msg_iter_var_next(i); + if (i == src->sg.end && len) + return -ENOSPC; + sge = sk_msg_elem(src, i); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sk_msg_clone); + void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes) { int i = msg->sg.start; diff --git a/net/core/sock.c b/net/core/sock.c index 7e8796a6a089..52e4f1c16b1e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2238,67 +2238,6 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); -int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, - int first_coalesce) -{ - int sg_curr = *sg_curr_index, use = 0, rc = 0; - unsigned int size = *sg_curr_size; - struct page_frag *pfrag; - struct scatterlist *sge; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - unsigned int orig_offset; - - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + sg_curr - 1; - if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && - sge->offset + sge->length == orig_offset) { - sge->length += use; - } else { - sge = sg + sg_curr; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - sg_curr++; - - if (sg_curr == MAX_SKB_FRAGS) - sg_curr = 0; - - if (sg_curr == sg_start) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } -out: - *sg_curr_size = size; - *sg_curr_index = sg_curr; - return rc; -} -EXPORT_SYMBOL(sk_alloc_sg); - static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 73f05ece53d0..99c1a19c17b1 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -8,6 +8,7 @@ config TLS select CRYPTO_AES select CRYPTO_GCM select STREAM_PARSER + select NET_SOCK_MSG default n ---help--- Enable kernel support for TLS protocol. 
This allows symmetric diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 961b07d4d41c..276edbc04f38 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -421,7 +421,7 @@ last_record: tls_push_record_flags = flags; if (more) { tls_ctx->pending_open_record_frags = - record->num_frags; + !!record->num_frags; break; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index aa9fdce272b6..5043b0be1448 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -213,153 +213,49 @@ static int tls_do_decryption(struct sock *sk, return ret; } -static void trim_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, int target_size) -{ - int i = *sg_num_elem - 1; - int trim = *sg_size - target_size; - - if (trim <= 0) { - WARN_ON(trim < 0); - return; - } - - *sg_size = target_size; - while (trim >= sg[i].length) { - trim -= sg[i].length; - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - i--; - - if (i < 0) - goto out; - } - - sg[i].length -= trim; - sk_mem_uncharge(sk, trim); - -out: - *sg_num_elem = i + 1; -} - -static void trim_both_sgl(struct sock *sk, int target_size) +static void tls_trim_both_msgs(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - trim_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - target_size); - + sk_msg_trim(sk, &rec->msg_plaintext, target_size); if (target_size > 0) target_size += tls_ctx->tx.overhead_size; - - trim_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, - target_size); + sk_msg_trim(sk, &rec->msg_encrypted, target_size); } -static int alloc_encrypted_sg(struct sock *sk, int len) +static int tls_alloc_encrypted_msg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - int rc = 0; - - rc = sk_alloc_sg(sk, len, - &rec->sg_encrypted_data[1], 0, - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, 0); - - if (rc == -ENOSPC) - rec->sg_encrypted_num_elem = - ARRAY_SIZE(rec->sg_encrypted_data) - 1; + struct sk_msg *msg_en = &rec->msg_encrypted; - return rc; + return sk_msg_alloc(sk, msg_en, len, 0); } -static int move_to_plaintext_sg(struct sock *sk, int required_size) +static int tls_clone_plaintext_msg(struct sock *sk, int required) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; - struct scatterlist *plain_sg = &rec->sg_plaintext_data[1]; - struct scatterlist *enc_sg = &rec->sg_encrypted_data[1]; - int enc_sg_idx = 0; + struct sk_msg *msg_pl = &rec->msg_plaintext; + struct sk_msg *msg_en = &rec->msg_encrypted; int skip, len; - if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) - return -ENOSPC; - - /* We add page references worth len bytes from enc_sg at the - * end of plain_sg. It is guaranteed that sg_encrypted_data + /* We add page references worth len bytes from encrypted sg + * at the end of plaintext sg. It is guaranteed that msg_en * has enough required room (ensured by caller). */ - len = required_size - rec->sg_plaintext_size; + len = required - msg_pl->sg.size; - /* Skip initial bytes in sg_encrypted_data to be able - * to use same offset of both plain and encrypted data. 
+ /* Skip initial bytes in msg_en's data to be able to use + * same offset of both plain and encrypted data. */ - skip = tls_ctx->tx.prepend_size + rec->sg_plaintext_size; - - while (enc_sg_idx < rec->sg_encrypted_num_elem) { - if (enc_sg[enc_sg_idx].length > skip) - break; - - skip -= enc_sg[enc_sg_idx].length; - enc_sg_idx++; - } + skip = tls_ctx->tx.prepend_size + msg_pl->sg.size; - /* unmark the end of plain_sg*/ - sg_unmark_end(plain_sg + rec->sg_plaintext_num_elem - 1); - - while (len) { - struct page *page = sg_page(&enc_sg[enc_sg_idx]); - int bytes = enc_sg[enc_sg_idx].length - skip; - int offset = enc_sg[enc_sg_idx].offset + skip; - - if (bytes > len) - bytes = len; - else - enc_sg_idx++; - - /* Skipping is required only one time */ - skip = 0; - - /* Increment page reference */ - get_page(page); - - sg_set_page(&plain_sg[rec->sg_plaintext_num_elem], page, - bytes, offset); - - sk_mem_charge(sk, bytes); - - len -= bytes; - rec->sg_plaintext_size += bytes; - - rec->sg_plaintext_num_elem++; - - if (rec->sg_plaintext_num_elem == MAX_SKB_FRAGS) - return -ENOSPC; - } - - return 0; -} - -static void free_sg(struct sock *sk, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size) -{ - int i, n = *sg_num_elem; - - for (i = 0; i < n; ++i) { - sk_mem_uncharge(sk, sg[i].length); - put_page(sg_page(&sg[i])); - } - *sg_num_elem = 0; - *sg_size = 0; + return sk_msg_clone(sk, msg_pl, msg_en, skip, len); } static void tls_free_open_rec(struct sock *sk) @@ -372,14 +268,8 @@ static void tls_free_open_rec(struct sock *sk) if (!rec) return; - free_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } @@ -388,6 +278,7 @@ int tls_tx_records(struct sock *sk, int flags) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; + struct sk_msg *msg_en; int tx_flags, rc = 0; if (tls_is_partially_sent_record(tls_ctx)) { @@ -407,9 +298,7 @@ int tls_tx_records(struct sock *sk, int flags) * Remove the head of tx_list */ list_del(&rec->list); - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } @@ -421,17 +310,15 @@ int tls_tx_records(struct sock *sk, int flags) else tx_flags = flags; + msg_en = &rec->msg_encrypted; rc = tls_push_sg(sk, tls_ctx, - &rec->sg_encrypted_data[1], + &msg_en->sg.data[msg_en->sg.curr], 0, tx_flags); if (rc) goto tx_err; list_del(&rec->list); - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } else { break; @@ -451,15 +338,18 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) struct sock *sk = req->data; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct scatterlist *sge; + struct sk_msg *msg_en; struct tls_rec *rec; bool ready = false; int pending; rec = container_of(aead_req, struct tls_rec, aead_req); + msg_en = &rec->msg_encrypted; - rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; - + sge = sk_msg_elem(msg_en, msg_en->sg.curr); + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += 
tls_ctx->tx.prepend_size; /* Check if error is previously set on socket */ if (err || sk->sk_err) { @@ -497,31 +387,29 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) /* Schedule the transmission */ if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) - schedule_delayed_work(&ctx->tx_work.work, 2); + schedule_delayed_work(&ctx->tx_work.work, 1); } static int tls_do_encryption(struct sock *sk, struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, struct aead_request *aead_req, - size_t data_len) + size_t data_len, u32 start) { struct tls_rec *rec = ctx->open_rec; - struct scatterlist *plain_sg = rec->sg_plaintext_data; - struct scatterlist *enc_sg = rec->sg_encrypted_data; + struct sk_msg *msg_en = &rec->msg_encrypted; + struct scatterlist *sge = sk_msg_elem(msg_en, start); int rc; - /* Skip the first index as it contains AAD data */ - rec->sg_encrypted_data[1].offset += tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length -= tls_ctx->tx.prepend_size; + sge->offset += tls_ctx->tx.prepend_size; + sge->length -= tls_ctx->tx.prepend_size; - /* If it is inplace crypto, then pass same SG list as both src, dst */ - if (rec->inplace_crypto) - plain_sg = enc_sg; + msg_en->sg.curr = start; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); - aead_request_set_crypt(aead_req, plain_sg, enc_sg, + aead_request_set_crypt(aead_req, rec->sg_aead_in, + rec->sg_aead_out, data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, @@ -534,8 +422,8 @@ static int tls_do_encryption(struct sock *sk, rc = crypto_aead_encrypt(aead_req); if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); - rec->sg_encrypted_data[1].offset -= tls_ctx->tx.prepend_size; - rec->sg_encrypted_data[1].length += tls_ctx->tx.prepend_size; + sge->offset -= tls_ctx->tx.prepend_size; + sge->length += tls_ctx->tx.prepend_size; } if (!rc) { @@ -557,35 +445,50 @@ static int tls_push_record(struct sock *sk, int flags, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl, *msg_en; struct aead_request *req; int rc; + u32 i; if (!rec) return 0; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + rec->tx_flags = flags; req = &rec->aead_req; - sg_mark_end(rec->sg_plaintext_data + rec->sg_plaintext_num_elem); - sg_mark_end(rec->sg_encrypted_data + rec->sg_encrypted_num_elem); + i = msg_pl->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_pl, i)); - tls_make_aad(rec->aad_space, rec->sg_plaintext_size, + i = msg_pl->sg.start; + sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? 
+ &msg_en->sg.data[i] : &msg_pl->sg.data[i]); + + i = msg_en->sg.end; + sk_msg_iter_var_prev(i); + sg_mark_end(sk_msg_elem(msg_en, i)); + + i = msg_en->sg.start; + sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); + + tls_make_aad(rec->aad_space, msg_pl->sg.size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, - page_address(sg_page(&rec->sg_encrypted_data[1])) + - rec->sg_encrypted_data[1].offset, - rec->sg_plaintext_size, record_type); + page_address(sg_page(&msg_en->sg.data[i])) + + msg_en->sg.data[i].offset, msg_pl->sg.size, + record_type); - tls_ctx->pending_open_record_frags = 0; - - rc = tls_do_encryption(sk, tls_ctx, ctx, req, rec->sg_plaintext_size); - if (rc == -EINPROGRESS) - return -EINPROGRESS; + tls_ctx->pending_open_record_frags = false; + rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); if (rc < 0) { - tls_err_abort(sk, EBADMSG); + if (rc != -EINPROGRESS) + tls_err_abort(sk, EBADMSG); return rc; } @@ -597,104 +500,11 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags) return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); } -static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length, int *pages_used, - unsigned int *size_used, - struct scatterlist *to, int to_max_pages, - bool charge) -{ - struct page *pages[MAX_SKB_FRAGS]; - - size_t offset; - ssize_t copied, use; - int i = 0; - unsigned int size = *size_used; - int num_elem = *pages_used; - int rc = 0; - int maxpages; - - while (length > 0) { - i = 0; - maxpages = to_max_pages - num_elem; - if (maxpages == 0) { - rc = -EFAULT; - goto out; - } - copied = iov_iter_get_pages(from, pages, - length, - maxpages, &offset); - if (copied <= 0) { - rc = -EFAULT; - goto out; - } - - iov_iter_advance(from, copied); - - length -= copied; - size += copied; - while (copied) { - use = min_t(int, copied, PAGE_SIZE - offset); - - sg_set_page(&to[num_elem], - pages[i], use, offset); - sg_unmark_end(&to[num_elem]); - if (charge) - sk_mem_charge(sk, use); - - offset = 0; - copied -= use; - - ++i; - ++num_elem; - } - } - - /* Mark the end in the last sg entry if newly added */ - if (num_elem > *pages_used) - sg_mark_end(&to[num_elem - 1]); -out: - if (rc) - iov_iter_revert(from, size - *size_used); - *size_used = size; - *pages_used = num_elem; - - return rc; -} - -static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, - int bytes) -{ - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; - struct scatterlist *sg = &rec->sg_plaintext_data[1]; - int copy, i, rc = 0; - - for (i = tls_ctx->pending_open_record_frags; - i < rec->sg_plaintext_num_elem; ++i) { - copy = sg[i].length; - if (copy_from_iter( - page_address(sg_page(&sg[i])) + sg[i].offset, - copy, from) != copy) { - rc = -EFAULT; - goto out; - } - bytes -= copy; - - ++tls_ctx->pending_open_record_frags; - - if (!bytes) - break; - } - -out: - return rc; -} - static struct tls_rec *get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int mem_size; @@ -708,15 +518,21 @@ static struct tls_rec *get_rec(struct sock *sk) if (!rec) return NULL; - sg_init_table(&rec->sg_plaintext_data[0], - ARRAY_SIZE(rec->sg_plaintext_data)); - sg_init_table(&rec->sg_encrypted_data[0], - ARRAY_SIZE(rec->sg_encrypted_data)); + msg_pl = &rec->msg_plaintext; + msg_en = 
&rec->msg_encrypted; + + sk_msg_init(msg_pl); + sk_msg_init(msg_en); - sg_set_buf(&rec->sg_plaintext_data[0], rec->aad_space, + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, sizeof(rec->aad_space)); - sg_set_buf(&rec->sg_encrypted_data[0], rec->aad_space, + sg_unmark_end(&rec->sg_aead_in[1]); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); ctx->open_rec = rec; rec->inplace_crypto = 1; @@ -735,6 +551,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) bool is_kvec = msg->msg_iter.type & ITER_KVEC; bool eor = !(msg->msg_flags & MSG_MORE); size_t try_to_copy, copied = 0; + struct sk_msg *msg_pl, *msg_en; struct tls_rec *rec; int required_size; int num_async = 0; @@ -778,23 +595,26 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - orig_size = rec->sg_plaintext_size; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; } - required_size = rec->sg_plaintext_size + try_to_copy + + required_size = msg_pl->sg.size + try_to_copy + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_encrypted: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -803,17 +623,13 @@ alloc_encrypted: * actually allocated. The difference is due * to max sg elements limit */ - try_to_copy -= required_size - rec->sg_encrypted_size; + try_to_copy -= required_size - msg_en->sg.size; full_record = true; } if (!is_kvec && (full_record || eor) && !async_capable) { - ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy, &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - &rec->sg_plaintext_data[1], - ARRAY_SIZE(rec->sg_plaintext_data) - 1, - true); + ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, + msg_pl, try_to_copy); if (ret) goto fallback_to_reg_send; @@ -831,15 +647,12 @@ alloc_encrypted: continue; fallback_to_reg_send: - trim_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size, - orig_size); + sk_msg_trim(sk, msg_pl, orig_size); } - required_size = rec->sg_plaintext_size + try_to_copy; + required_size = msg_pl->sg.size + try_to_copy; - ret = move_to_plaintext_sg(sk, required_size); + ret = tls_clone_plaintext_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto send_end; @@ -848,20 +661,21 @@ fallback_to_reg_send: * actually allocated. 
The difference is due * to max sg elements limit */ - try_to_copy -= required_size - rec->sg_plaintext_size; + try_to_copy -= required_size - msg_pl->sg.size; full_record = true; - - trim_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size, - rec->sg_plaintext_size + - tls_ctx->tx.overhead_size); + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); } - ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); - if (ret) + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_pl, + try_to_copy); + if (ret < 0) goto trim_sgl; + /* Open records defined only if successfully copied, otherwise + * we would trim the sg but not reset the open record frags. + */ + tls_ctx->pending_open_record_frags = true; copied += try_to_copy; if (full_record || eor) { ret = tls_push_record(sk, msg->msg_flags, record_type); @@ -881,11 +695,11 @@ wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { trim_sgl: - trim_both_sgl(sk, orig_size); + tls_trim_both_msgs(sk, orig_size); goto send_end; } - if (rec->sg_encrypted_size < required_size) + if (msg_en->sg.size < required_size) goto alloc_encrypted; } @@ -929,7 +743,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); unsigned char record_type = TLS_RECORD_TYPE_DATA; size_t orig_size = size; - struct scatterlist *sg; + struct sk_msg *msg_pl; struct tls_rec *rec; int num_async = 0; bool full_record; @@ -970,20 +784,23 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } + msg_pl = &rec->msg_plaintext; + full_record = false; - record_room = TLS_MAX_PAYLOAD_SIZE - rec->sg_plaintext_size; + record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; copy = size; if (copy >= record_room) { copy = record_room; full_record = true; } - required_size = rec->sg_plaintext_size + copy + - tls_ctx->tx.overhead_size; + + required_size = msg_pl->sg.size + copy + + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; alloc_payload: - ret = alloc_encrypted_sg(sk, required_size); + ret = tls_alloc_encrypted_msg(sk, required_size); if (ret) { if (ret != -ENOSPC) goto wait_for_memory; @@ -992,26 +809,18 @@ alloc_payload: * actually allocated. 
The difference is due * to max sg elements limit */ - copy -= required_size - rec->sg_plaintext_size; + copy -= required_size - msg_pl->sg.size; full_record = true; } - get_page(page); - sg = &rec->sg_plaintext_data[1] + rec->sg_plaintext_num_elem; - sg_set_page(sg, page, copy, offset); - sg_unmark_end(sg); - - rec->sg_plaintext_num_elem++; - + sk_msg_page_add(msg_pl, page, copy, offset); sk_mem_charge(sk, copy); + offset += copy; size -= copy; - rec->sg_plaintext_size += copy; - tls_ctx->pending_open_record_frags = rec->sg_plaintext_num_elem; - if (full_record || eor || - rec->sg_plaintext_num_elem == - ARRAY_SIZE(rec->sg_plaintext_data) - 1) { + tls_ctx->pending_open_record_frags = true; + if (full_record || eor || sk_msg_full(msg_pl)) { rec->inplace_crypto = 0; ret = tls_push_record(sk, flags, record_type); if (ret) { @@ -1027,7 +836,7 @@ wait_for_sndbuf: wait_for_memory: ret = sk_stream_wait_memory(sk, &timeo); if (ret) { - trim_both_sgl(sk, rec->sg_plaintext_size); + tls_trim_both_msgs(sk, msg_pl->sg.size); goto sendpage_end; } @@ -1092,6 +901,64 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, return skb; } +static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from, + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, + int to_max_pages) +{ + int rc = 0, i = 0, num_elem = *pages_used, maxpages; + struct page *pages[MAX_SKB_FRAGS]; + unsigned int size = *size_used; + ssize_t copied, use; + size_t offset; + + while (length > 0) { + i = 0; + maxpages = to_max_pages - num_elem; + if (maxpages == 0) { + rc = -EFAULT; + goto out; + } + copied = iov_iter_get_pages(from, pages, + length, + maxpages, &offset); + if (copied <= 0) { + rc = -EFAULT; + goto out; + } + + iov_iter_advance(from, copied); + + length -= copied; + size += copied; + while (copied) { + use = min_t(int, copied, PAGE_SIZE - offset); + + sg_set_page(&to[num_elem], + pages[i], use, offset); + sg_unmark_end(&to[num_elem]); + /* We do not uncharge memory from this API */ + + offset = 0; + copied -= use; + + i++; + num_elem++; + } + } + /* Mark the end in the last sg entry if newly added */ + if (num_elem > *pages_used) + sg_mark_end(&to[num_elem - 1]); +out: + if (rc) + iov_iter_revert(from, size - *size_used); + *size_used = size; + *pages_used = num_elem; + + return rc; +} + /* This function decrypts the input skb into either out_iov or in out_sg * or in skb buffers itself. The input parameter 'zc' indicates if * zero-copy mode needs to be tried or not. 
With zero-copy mode, either @@ -1189,9 +1056,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, sg_set_buf(&sgout[0], aad, TLS_AAD_SPACE_SIZE); *chunk = 0; - err = zerocopy_from_iter(sk, out_iov, data_len, &pages, - chunk, &sgout[1], - (n_sgout - 1), false); + err = tls_setup_from_iter(sk, out_iov, data_len, + &pages, chunk, &sgout[1], + (n_sgout - 1)); if (err < 0) goto fallback_to_reg_recv; } else if (out_sg) { @@ -1619,25 +1486,15 @@ void tls_sw_free_resources_tx(struct sock *sk) rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - list_del(&rec->list); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } list_for_each_entry_safe(rec, tmp, &ctx->tx_list, list) { - free_sg(sk, &rec->sg_encrypted_data[1], - &rec->sg_encrypted_num_elem, - &rec->sg_encrypted_size); - - free_sg(sk, &rec->sg_plaintext_data[1], - &rec->sg_plaintext_num_elem, - &rec->sg_plaintext_size); - list_del(&rec->list); + sk_msg_free(sk, &rec->msg_encrypted); + sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); }
--
From d3b18ad31f93d0b6bae105c679018a1ba7daa9ca Mon Sep 17 00:00:00 2001
From: John Fastabend
Date: Sat, 13 Oct 2018 02:46:01 +0200
Subject: tls: add bpf support to sk_msg handling

This work adds BPF sk_msg verdict program support to kTLS, allowing BPF and kTLS to be combined. Previously kTLS and sk_msg verdict programs were mutually exclusive in the ULP layer, which created challenges for the orchestrator when trying to apply TCP-based policy, for example. To resolve this, leveraging the work from previous patches that consolidates the use of sk_msg, we can finally enable BPF sk_msg verdict programs so they continue to run after the kTLS socket is created. There is no change in behavior when kTLS is not used in combination with BPF, and the kselftest suite for kTLS also runs successfully.

Joint work with Daniel.

Signed-off-by: John Fastabend
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
---
include/linux/skmsg.h | 41 ++++- net/tls/tls_sw.c | 439 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 414 insertions(+), 66 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 4e84b3c2eff8..0b919f0bc6d6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -29,7 +29,11 @@ struct sk_msg_sg { u32 size; u32 copybreak; bool copy[MAX_MSG_FRAGS]; - struct scatterlist data[MAX_MSG_FRAGS]; + /* The extra element is used for chaining the front and sections when + * the list becomes partitioned (e.g. end < start). The crypto APIs + * require the chaining. 
+ */ + struct scatterlist data[MAX_MSG_FRAGS + 1]; }; struct sk_msg { @@ -112,6 +116,7 @@ void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg, u32 bytes); void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes); +void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes); int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); @@ -161,8 +166,9 @@ static inline void sk_msg_clear_meta(struct sk_msg *msg) static inline void sk_msg_init(struct sk_msg *msg) { + BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != MAX_MSG_FRAGS); memset(msg, 0, sizeof(*msg)); - sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data)); + sg_init_marker(msg->sg.data, MAX_MSG_FRAGS); } static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, @@ -174,6 +180,12 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, src->sg.data[which].offset += size; } +static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) +{ + memcpy(dst, src, sizeof(*src)); + sk_msg_init(src); +} + static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { return msg->sg.end >= msg->sg.start ? @@ -229,6 +241,26 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sk_msg_iter_next(msg, end); } +static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) +{ + do { + msg->sg.copy[i] = copy_state; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + } while (1); +} + +static inline void sk_msg_sg_copy_set(struct sk_msg *msg, u32 start) +{ + sk_msg_sg_copy(msg, start, true); +} + +static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start) +{ + sk_msg_sg_copy(msg, start, false); +} + static inline struct sk_psock *sk_psock(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -245,6 +277,11 @@ static inline void sk_psock_queue_msg(struct sk_psock *psock, list_add_tail(&msg->list, &psock->ingress_msg); } +static inline bool sk_psock_queue_empty(const struct sk_psock *psock) +{ + return psock ? list_empty(&psock->ingress_msg) : true; +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 3b75e0dd51a2..a525fc4c2a4b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -4,6 +4,7 @@ * Copyright (c) 2016-2017, Lance Chao . All rights reserved. * Copyright (c) 2016, Fridolin Pokorny . All rights reserved. * Copyright (c) 2016, Nikos Mavrogiannopoulos . All rights reserved. + * Copyright (c) 2018, Covalent IO, Inc. http://covalent.io * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -258,21 +259,58 @@ static int tls_clone_plaintext_msg(struct sock *sk, int required) return sk_msg_clone(sk, msg_pl, msg_en, skip, len); } -static void tls_free_open_rec(struct sock *sk) +static struct tls_rec *tls_get_rec(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl, *msg_en; + struct tls_rec *rec; + int mem_size; - /* Return if there is no open record */ + mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + + rec = kzalloc(mem_size, sk->sk_allocation); if (!rec) - return; + return NULL; + msg_pl = &rec->msg_plaintext; + msg_en = &rec->msg_encrypted; + + sk_msg_init(msg_pl); + sk_msg_init(msg_en); + + sg_init_table(rec->sg_aead_in, 2); + sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_in[1]); + + sg_init_table(rec->sg_aead_out, 2); + sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, + sizeof(rec->aad_space)); + sg_unmark_end(&rec->sg_aead_out[1]); + + return rec; +} + +static void tls_free_rec(struct sock *sk, struct tls_rec *rec) +{ sk_msg_free(sk, &rec->msg_encrypted); sk_msg_free(sk, &rec->msg_plaintext); kfree(rec); } +static void tls_free_open_rec(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; + + if (rec) { + tls_free_rec(sk, rec); + ctx->open_rec = NULL; + } +} + int tls_tx_records(struct sock *sk, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); @@ -439,16 +477,135 @@ static int tls_do_encryption(struct sock *sk, return rc; } +static int tls_split_open_record(struct sock *sk, struct tls_rec *from, + struct tls_rec **to, struct sk_msg *msg_opl, + struct sk_msg *msg_oen, u32 split_point, + u32 tx_overhead_size, u32 *orig_end) +{ + u32 i, j, bytes = 0, apply = msg_opl->apply_bytes; + struct scatterlist *sge, *osge, *nsge; + u32 orig_size = msg_opl->sg.size; + struct scatterlist tmp = { }; + struct sk_msg *msg_npl; + struct tls_rec *new; + int ret; + + new = tls_get_rec(sk); + if (!new) + return -ENOMEM; + ret = sk_msg_alloc(sk, &new->msg_encrypted, msg_opl->sg.size + + tx_overhead_size, 0); + if (ret < 0) { + tls_free_rec(sk, new); + return ret; + } + + *orig_end = msg_opl->sg.end; + i = msg_opl->sg.start; + sge = sk_msg_elem(msg_opl, i); + while (apply && sge->length) { + if (sge->length > apply) { + u32 len = sge->length - apply; + + get_page(sg_page(sge)); + sg_set_page(&tmp, sg_page(sge), len, + sge->offset + apply); + sge->length = apply; + bytes += apply; + apply = 0; + } else { + apply -= sge->length; + bytes += sge->length; + } + + sk_msg_iter_var_next(i); + if (i == msg_opl->sg.end) + break; + sge = sk_msg_elem(msg_opl, i); + } + + msg_opl->sg.end = i; + msg_opl->sg.curr = i; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = 0; + msg_opl->sg.size = bytes; + + msg_npl = &new->msg_plaintext; + msg_npl->apply_bytes = apply; + msg_npl->sg.size = orig_size - bytes; + + j = msg_npl->sg.start; + nsge = sk_msg_elem(msg_npl, j); + if (tmp.length) { + memcpy(nsge, &tmp, sizeof(*nsge)); + sk_msg_iter_var_next(j); + nsge = sk_msg_elem(msg_npl, j); + } + + osge = sk_msg_elem(msg_opl, i); + while (osge->length) { + memcpy(nsge, osge, sizeof(*nsge)); + sg_unmark_end(nsge); + sk_msg_iter_var_next(i); + sk_msg_iter_var_next(j); + if (i == *orig_end) + break; + osge = 
sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + } + + msg_npl->sg.end = j; + msg_npl->sg.curr = j; + msg_npl->sg.copybreak = 0; + + *to = new; + return 0; +} + +static void tls_merge_open_record(struct sock *sk, struct tls_rec *to, + struct tls_rec *from, u32 orig_end) +{ + struct sk_msg *msg_npl = &from->msg_plaintext; + struct sk_msg *msg_opl = &to->msg_plaintext; + struct scatterlist *osge, *nsge; + u32 i, j; + + i = msg_opl->sg.end; + sk_msg_iter_var_prev(i); + j = msg_npl->sg.start; + + osge = sk_msg_elem(msg_opl, i); + nsge = sk_msg_elem(msg_npl, j); + + if (sg_page(osge) == sg_page(nsge) && + osge->offset + osge->length == nsge->offset) { + osge->length += nsge->length; + put_page(sg_page(nsge)); + } + + msg_opl->sg.end = orig_end; + msg_opl->sg.curr = orig_end; + msg_opl->sg.copybreak = 0; + msg_opl->apply_bytes = msg_opl->sg.size + msg_npl->sg.size; + msg_opl->sg.size += msg_npl->sg.size; + + sk_msg_free(sk, &to->msg_encrypted); + sk_msg_xfer_full(&to->msg_encrypted, &from->msg_encrypted); + + kfree(from); +} + static int tls_push_record(struct sock *sk, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct tls_rec *rec = ctx->open_rec; + struct tls_rec *rec = ctx->open_rec, *tmp = NULL; + u32 i, split_point, uninitialized_var(orig_end); struct sk_msg *msg_pl, *msg_en; struct aead_request *req; + bool split; int rc; - u32 i; if (!rec) return 0; @@ -456,6 +613,18 @@ static int tls_push_record(struct sock *sk, int flags, msg_pl = &rec->msg_plaintext; msg_en = &rec->msg_encrypted; + split_point = msg_pl->apply_bytes; + split = split_point && split_point < msg_pl->sg.size; + if (split) { + rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en, + split_point, tls_ctx->tx.overhead_size, + &orig_end); + if (rc < 0) + return rc; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + } + rec->tx_flags = flags; req = &rec->aead_req; @@ -487,57 +656,139 @@ static int tls_push_record(struct sock *sk, int flags, rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); if (rc < 0) { - if (rc != -EINPROGRESS) + if (rc != -EINPROGRESS) { tls_err_abort(sk, EBADMSG); + if (split) { + tls_ctx->pending_open_record_frags = true; + tls_merge_open_record(sk, rec, tmp, orig_end); + } + } return rc; + } else if (split) { + msg_pl = &tmp->msg_plaintext; + msg_en = &tmp->msg_encrypted; + sk_msg_trim(sk, msg_en, msg_pl->sg.size + + tls_ctx->tx.overhead_size); + tls_ctx->pending_open_record_frags = true; + ctx->open_rec = tmp; } return tls_tx_records(sk, flags); } -static int tls_sw_push_pending_record(struct sock *sk, int flags) -{ - return tls_push_record(sk, flags, TLS_RECORD_TYPE_DATA); -} - -static struct tls_rec *get_rec(struct sock *sk) +static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, + bool full_record, u8 record_type, + size_t *copied, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); - struct sk_msg *msg_pl, *msg_en; + struct sk_msg msg_redir = { }; + struct sk_psock *psock; + struct sock *sk_redir; struct tls_rec *rec; - int mem_size; + int err = 0, send; + bool enospc; + + psock = sk_psock_get(sk); + if (!psock) + return tls_push_record(sk, flags, record_type); +more_data: + enospc = sk_msg_full(msg); + if (psock->eval == __SK_NONE) + psock->eval = sk_psock_msg_verdict(sk, psock, msg); + if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && + !enospc && !full_record) 
{ + err = -ENOSPC; + goto out_err; + } + msg->cork_bytes = 0; + send = msg->sg.size; + if (msg->apply_bytes && msg->apply_bytes < send) + send = msg->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = tls_push_record(sk, flags, record_type); + if (err < 0) { + *copied -= sk_msg_free(sk, msg); + tls_free_open_rec(sk); + goto out_err; + } + break; + case __SK_REDIRECT: + sk_redir = psock->sk_redir; + memcpy(&msg_redir, msg, sizeof(*msg)); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + sk_msg_return_zero(sk, msg, send); + msg->sg.size -= send; + release_sock(sk); + err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + lock_sock(sk); + if (err < 0) { + *copied -= sk_msg_free_nocharge(sk, &msg_redir); + msg->sg.size = 0; + } + if (msg->sg.size == 0) + tls_free_open_rec(sk); + break; + case __SK_DROP: + default: + sk_msg_free_partial(sk, msg, send); + if (msg->apply_bytes < send) + msg->apply_bytes = 0; + else + msg->apply_bytes -= send; + if (msg->sg.size == 0) + tls_free_open_rec(sk); + *copied -= send; + err = -EACCES; + } - /* Return if we already have an open record */ - if (ctx->open_rec) - return ctx->open_rec; + if (likely(!err)) { + bool reset_eval = !ctx->open_rec; - mem_size = sizeof(struct tls_rec) + crypto_aead_reqsize(ctx->aead_send); + rec = ctx->open_rec; + if (rec) { + msg = &rec->msg_plaintext; + if (!msg->apply_bytes) + reset_eval = true; + } + if (reset_eval) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } + if (rec) + goto more_data; + } + out_err: + sk_psock_put(sk, psock); + return err; +} + +static int tls_sw_push_pending_record(struct sock *sk, int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); + struct tls_rec *rec = ctx->open_rec; + struct sk_msg *msg_pl; + size_t copied; - rec = kzalloc(mem_size, sk->sk_allocation); if (!rec) - return NULL; + return 0; msg_pl = &rec->msg_plaintext; - msg_en = &rec->msg_encrypted; - - sk_msg_init(msg_pl); - sk_msg_init(msg_en); - - sg_init_table(rec->sg_aead_in, 2); - sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, - sizeof(rec->aad_space)); - sg_unmark_end(&rec->sg_aead_in[1]); - - sg_init_table(rec->sg_aead_out, 2); - sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, - sizeof(rec->aad_space)); - sg_unmark_end(&rec->sg_aead_out[1]); - - ctx->open_rec = rec; - rec->inplace_crypto = 1; + copied = msg_pl->sg.size; + if (!copied) + return 0; - return rec; + return bpf_exec_tx_verdict(msg_pl, sk, true, TLS_RECORD_TYPE_DATA, + &copied, flags); } int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) @@ -589,7 +840,10 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto send_end; } - rec = get_rec(sk); + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto send_end; @@ -628,6 +882,8 @@ alloc_encrypted: } if (!is_kvec && (full_record || eor) && !async_capable) { + u32 first = msg_pl->sg.end; + ret = sk_msg_zerocopy_from_iter(sk, &msg->msg_iter, msg_pl, try_to_copy); if (ret) @@ -637,15 +893,27 @@ alloc_encrypted: num_zc++; copied += try_to_copy; - ret = tls_push_record(sk, msg->msg_flags, record_type); + + sk_msg_sg_copy_set(msg_pl, first); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; + else if (ret == -ENOMEM) + goto 
wait_for_memory; + else if (ret == -ENOSPC) + goto rollback_iter; else if (ret != -EAGAIN) goto send_end; } continue; - +rollback_iter: + copied -= try_to_copy; + sk_msg_sg_copy_clear(msg_pl, first); + iov_iter_revert(&msg->msg_iter, + msg_pl->sg.size - orig_size); fallback_to_reg_send: sk_msg_trim(sk, msg_pl, orig_size); } @@ -678,12 +946,19 @@ fallback_to_reg_send: tls_ctx->pending_open_record_frags = true; copied += try_to_copy; if (full_record || eor) { - ret = tls_push_record(sk, msg->msg_flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, + msg->msg_flags); if (ret) { if (ret == -EINPROGRESS) num_async++; - else if (ret != -EAGAIN) + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; goto send_end; + } } } @@ -742,10 +1017,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); unsigned char record_type = TLS_RECORD_TYPE_DATA; - size_t orig_size = size; struct sk_msg *msg_pl; struct tls_rec *rec; int num_async = 0; + size_t copied = 0; bool full_record; int record_room; int ret = 0; @@ -778,7 +1053,10 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, goto sendpage_end; } - rec = get_rec(sk); + if (ctx->open_rec) + rec = ctx->open_rec; + else + rec = ctx->open_rec = tls_get_rec(sk); if (!rec) { ret = -ENOMEM; goto sendpage_end; @@ -788,6 +1066,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, full_record = false; record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + copied = 0; copy = size; if (copy >= record_room) { copy = record_room; @@ -818,16 +1097,23 @@ alloc_payload: offset += copy; size -= copy; + copied += copy; tls_ctx->pending_open_record_frags = true; if (full_record || eor || sk_msg_full(msg_pl)) { rec->inplace_crypto = 0; - ret = tls_push_record(sk, flags, record_type); + ret = bpf_exec_tx_verdict(msg_pl, sk, full_record, + record_type, &copied, flags); if (ret) { if (ret == -EINPROGRESS) num_async++; - else if (ret != -EAGAIN) + else if (ret == -ENOMEM) + goto wait_for_memory; + else if (ret != -EAGAIN) { + if (ret == -ENOSPC) + ret = 0; goto sendpage_end; + } } } continue; @@ -851,24 +1137,20 @@ wait_for_memory: } } sendpage_end: - if (orig_size > size) - ret = orig_size - size; - else - ret = sk_stream_error(sk, flags, ret); - + ret = sk_stream_error(sk, flags, ret); release_sock(sk); - return ret; + return copied ? 
copied : ret; } -static struct sk_buff *tls_wait_data(struct sock *sk, int flags, - long timeo, int *err) +static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock, + int flags, long timeo, int *err) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); - while (!(skb = ctx->recv_pkt)) { + while (!(skb = ctx->recv_pkt) && sk_psock_queue_empty(psock)) { if (sk->sk_err) { *err = sock_error(sk); return NULL; @@ -887,7 +1169,10 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_wait_event(sk, &timeo, + ctx->recv_pkt != skb || + !sk_psock_queue_empty(psock), + &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); @@ -1164,6 +1449,7 @@ int tls_sw_recvmsg(struct sock *sk, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; unsigned char control; struct strp_msg *rxm; struct sk_buff *skb; @@ -1179,6 +1465,7 @@ int tls_sw_recvmsg(struct sock *sk, if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + psock = sk_psock_get(sk); lock_sock(sk); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -1188,9 +1475,19 @@ int tls_sw_recvmsg(struct sock *sk, bool async = false; int chunk = 0; - skb = tls_wait_data(sk, flags, timeo, &err); - if (!skb) + skb = tls_wait_data(sk, psock, flags, timeo, &err); + if (!skb) { + if (psock) { + int ret = __tcp_bpf_recvmsg(sk, psock, msg, len); + + if (ret > 0) { + copied += ret; + len -= ret; + continue; + } + } goto recv_end; + } rxm = strp_msg(skb); @@ -1296,6 +1593,8 @@ recv_end: } release_sock(sk); + if (psock) + sk_psock_put(sk, psock); return copied ? 
: err; } @@ -1318,7 +1617,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - skb = tls_wait_data(sk, flags, timeo, &err); + skb = tls_wait_data(sk, NULL, flags, timeo, &err); if (!skb) goto splice_read_end; @@ -1356,11 +1655,16 @@ bool tls_sw_stream_read(const struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + bool ingress_empty = true; + struct sk_psock *psock; - if (ctx->recv_pkt) - return true; + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) + ingress_empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); - return false; + return !ingress_empty || ctx->recv_pkt; } static int tls_read_size(struct strparser *strp, struct sk_buff *skb) @@ -1439,8 +1743,15 @@ static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + struct sk_psock *psock; strp_data_ready(&ctx->strp); + + psock = sk_psock_get(sk); + if (psock && !list_empty(&psock->ingress_msg)) { + ctx->saved_data_ready(sk); + sk_psock_put(sk, psock); + } } void tls_sw_free_resources_tx(struct sock *sk) -- cgit v1.2.3 From 3f4c3127d332000530349db4843deece27fe5e0c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 10:36:01 -0700 Subject: bpf: sockmap, fix skmsg recvmsg handler to track size correctly When converting sockmap to new skmsg generic data structures we missed that the recvmsg handler did not correctly use sg.size and instead was using individual elements length. The result is if a sock is closed with outstanding data we omit the call to sk_mem_uncharge() and can get the warning below. [ 66.728282] WARNING: CPU: 6 PID: 5783 at net/core/stream.c:206 sk_stream_kill_queues+0x1fa/0x210 To fix this correct the redirect handler to xfer the size along with the scatterlist and also decrement the size from the recvmsg handler. Now when a sock is closed the remaining 'size' will be decremented with sk_mem_uncharge(). Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 1 + net/ipv4/tcp_bpf.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 0b919f0bc6d6..31df0d9fa536 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -176,6 +176,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, { dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; + dst->sg.size += size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 80debb0daf37..f9d3cf185827 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -73,6 +73,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, sge->offset += copy; sge->length -= copy; sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; if (!sge->length) { i++; if (i == MAX_SKB_FRAGS) -- cgit v1.2.3 From 8734a162c13b1a893e7dff8de0df81fed04c51a6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 11:07:59 -0700 Subject: bpf: skmsg, improve sk_msg_used_element to work in cork context Currently sk_msg_used_element is only called in zerocopy context where cork is not possible and if this case happens we fallback to copy mode. However the helper is more useful if it works in all contexts. 
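To make the ambiguity concrete, here is a minimal standalone sketch of the ring-occupancy arithmetic (illustrative only; a plain struct and nslots stand in for sk_msg_sg and MAX_MSG_FRAGS, mirroring the helper as changed below):

typedef unsigned int u32;

struct ring {
	u32 start;	/* index of first used slot */
	u32 end;	/* index of first free slot */
	u32 size;	/* total bytes queued across all slots */
};

/* start == end alone cannot distinguish a full ring from an empty
 * one; a non-zero byte count marks the full case.
 */
static u32 ring_elem_used(const struct ring *r, u32 nslots)
{
	if (r->start == r->end && r->size)
		return nslots;
	return r->end >= r->start ? r->end - r->start
				  : r->end + (nslots - r->start);
}

For example, with nslots == 8: start == end == 2 and size == 0 yields 0 elements (empty ring), while start == end == 2 with size > 0 now yields 8 rather than the bogus 0.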
This patch resolves the case where end == head: that state indicates either a full or an empty ring, and the helper always reported an empty ring. To fix this, add a test for the full ring case so that a full ring is not reported as having 0 elements. This additional functionality will be used in the next patches from the recvmsg context, where end == head with a full ring is a valid case. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 31df0d9fa536..22347b08e1f8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -187,18 +187,21 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) sk_msg_init(src); } +static inline bool sk_msg_full(const struct sk_msg *msg) +{ + return (msg->sg.end == msg->sg.start) && msg->sg.size; +} + static inline u32 sk_msg_elem_used(const struct sk_msg *msg) { + if (sk_msg_full(msg)) + return MAX_MSG_FRAGS; + return msg->sg.end >= msg->sg.start ? msg->sg.end - msg->sg.start : msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); } -static inline bool sk_msg_full(const struct sk_msg *msg) -{ - return (msg->sg.end == msg->sg.start) && msg->sg.size; -} - static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; -- cgit v1.2.3 From 5032d079909d1ac5c2535acc32d5f01cd245d8ea Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 18 Oct 2018 13:58:35 -0700 Subject: bpf: skmsg, fix psock create on existing kcm/tls port Before using the psock returned by sk_psock_get() when adding it to a sockmap, we need to ensure it is actually a sockmap-based psock. Previously we were only checking this after incrementing the reference counter, which was an error and resulted in a slab-out-of-bounds access when the psock was not actually a sockmap type. This moves the check up so the reference counter is only used if it is a sockmap psock.
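In condensed form, the corrected ordering looks like this (a sketch of the logic in the sk_psock_get_checked() helper added below, not a drop-in function):

	struct sk_psock *psock;

	rcu_read_lock();
	psock = sk_psock(sk);
	if (psock) {
		/* A psock installed by kcm or tls has a different layout;
		 * bail out before touching its refcount.
		 */
		if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg)
			psock = ERR_PTR(-EBUSY);
		else if (!refcount_inc_not_zero(&psock->refcnt))
			psock = ERR_PTR(-EBUSY);
	}
	rcu_read_unlock();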
Eric reported the following KASAN BUG, BUG: KASAN: slab-out-of-bounds in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] BUG: KASAN: slab-out-of-bounds in refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120 Read of size 4 at addr ffff88019548be58 by task syz-executor4/22387 CPU: 1 PID: 22387 Comm: syz-executor4 Not tainted 4.19.0-rc7+ #264 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272 atomic_read include/asm-generic/atomic-instrumented.h:21 [inline] refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120 sk_psock_get include/linux/skmsg.h:379 [inline] sock_map_link.isra.6+0x41f/0xe30 net/core/sock_map.c:178 sock_hash_update_common+0x19b/0x11e0 net/core/sock_map.c:669 sock_hash_update_elem+0x306/0x470 net/core/sock_map.c:738 map_update_elem+0x819/0xdf0 kernel/bpf/syscall.c:818 Signed-off-by: John Fastabend Reported-by: Eric Dumazet Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 25 ++++++++++++++++++++----- net/core/sock_map.c | 11 ++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 22347b08e1f8..84e18863f6a4 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -270,11 +270,6 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } -static inline bool sk_has_psock(struct sock *sk) -{ - return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg; -} - static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { @@ -374,6 +369,26 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, return test_bit(bit, &psock->state); } +static inline struct sk_psock *sk_psock_get_checked(struct sock *sk) +{ + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + if (psock) { + if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) { + psock = ERR_PTR(-EBUSY); + goto out; + } + + if (!refcount_inc_not_zero(&psock->refcnt)) + psock = ERR_PTR(-EBUSY); + } +out: + rcu_read_unlock(); + return psock; +} + static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3c0e44cb811a..be6092ac69f8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -175,12 +175,13 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } } - psock = sk_psock_get(sk); + psock = sk_psock_get_checked(sk); + if (IS_ERR(psock)) { + ret = PTR_ERR(psock); + goto out_progs; + } + if (psock) { - if (!sk_has_psock(sk)) { - ret = -EBUSY; - goto out_progs; - } if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (skb_progs && READ_ONCE(psock->progs.skb_parser))) { sk_psock_put(sk, psock); -- cgit v1.2.3 From 6fff607e2f14bd7c63c06c464a6f93b8efbabe28 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Oct 2018 19:56:49 -0700 Subject: bpf: sk_msg program helper bpf_msg_push_data This allows user to push data into a 
msg using sk_msg program types. The format is as follows: bpf_msg_push_data(msg, offset, len, flags). This will insert 'len' bytes at offset 'offset'. For example, to prepend 10 bytes at the front of the message, the user can call bpf_msg_push_data(msg, 0, 10, 0); This will invalidate the data bounds, so the BPF user will have to recheck the data bounds after calling this helper. After this the msg size will have been updated and the user is free to write into the added bytes. We allow any offset/len as long as it is within the (data, data_end) range. However, a copy will be required if the ring is full, and it is possible for the helper to fail with ENOMEM or EINVAL errors, which need to be handled by the BPF program. This can be used, similar to XDP metadata, to pass data between the sk_msg layer and lower layers. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 5 ++ include/uapi/linux/bpf.h | 20 ++++++- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 1 deletion(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 84e18863f6a4..2a11e9d91dfa 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -207,6 +207,11 @@ static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) return &msg->sg.data[which]; } +static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) +{ + return msg->sg.data[which]; +} + static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a2fb333290dc..852dc17ab47a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2240,6 +2240,23 @@ union bpf_attr { * pointer that was returned from bpf_sk_lookup_xxx\ (). * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_push_data(struct sk_msg_md *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into msg at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the msg. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail under memory pressure (an allocation + * fails); in these cases the BPF program will get an appropriate + * error and will need to handle it. + * + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2331,7 +2348,8 @@ union bpf_attr { FN(sk_release), \ FN(map_push_elem), \ FN(map_pop_elem), \ - FN(map_peek_elem), + FN(map_peek_elem), \ + FN(msg_push_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 5fd5139e8638..35c6933c2622 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2297,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; + u32 new, i = 0, l, space, copy = 0, offset = 0; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (start >= offset + l) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + /* If no space available will fallback to copy, we need at + * least one scatterlist elem available to push data into + * when start aligns to the beginning of an element or two + * when it falls inside an element. We handle the start equals + * offset case because its the common case for inserting a + * header. + */ + if (!space || (space == 1 && start != offset)) + copy = msg->sg.data[i].length; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy + len)); + if (unlikely(!page)) + return -ENOMEM; + + if (copy) { + int front, back; + + raw = page_address(page); + + psge = sk_msg_elem(msg, i); + front = start - offset; + back = psge->length - front; + from = sg_virt(psge); + + if (front) + memcpy(raw, from, front); + + if (back) { + from += front; + to = raw + front + len; + + memcpy(to, from, back); + } + + put_page(sg_page(psge)); + } else if (start - offset) { + psge = sk_msg_elem(msg, i); + rsge = sk_msg_elem_cpy(msg, i); + + psge->length = start - offset; + rsge.length -= psge->length; + rsge.offset += start; + + sk_msg_iter_var_next(i); + sg_unmark_end(psge); + sk_msg_iter_next(msg, end); + } + + /* Slot(s) to place newly allocated data */ + new = i; + + /* Shift one or two slots as needed */ + if (!copy) { + sge = sk_msg_elem_cpy(msg, i); + + sk_msg_iter_var_next(i); + sg_unmark_end(&sge); + sk_msg_iter_next(msg, end); + + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { + sk_msg_iter_var_next(i); + nnsge = sk_msg_elem_cpy(msg, i); + } + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); + if (rsge.length) { + nsge = nnsge; + nnsge = sk_msg_elem_cpy(msg, i); + } else { + nsge = sk_msg_elem_cpy(msg, i); + } + } + } + + /* Place newly allocated data buffer */ + sk_mem_charge(msg->sk, len); + msg->sg.size += len; + msg->sg.copy[new] = false; + sg_set_page(&msg->sg.data[new], page, len + copy, 0); + if (rsge.length) { + get_page(sg_page(&rsge)); + sk_msg_iter_var_next(new); + msg->sg.data[new] = rsge; + } + + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_push_data_proto = { + .func = bpf_msg_push_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + 
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -4854,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || + func == bpf_msg_push_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5130,6 +5262,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_msg_push_data: + return &bpf_msg_push_data_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: -- cgit v1.2.3
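As a usage illustration for the new helper (not part of the series; the program name, section conventions, and the selftests-style bpf_helpers.h wrapper header are assumptions), a minimal sk_msg program that prepends a 10 byte header could look roughly like:

#include <linux/bpf.h>
#include "bpf_helpers.h"	/* assumed: provides SEC() and a bpf_msg_push_data() wrapper */

SEC("sk_msg")
int msg_prepend(struct sk_msg_md *msg)
{
	void *data, *data_end;

	/* Insert 10 bytes at offset 0, i.e. prepend a pseudo header. */
	if (bpf_msg_push_data(msg, 0, 10, 0))
		return SK_DROP;	/* ENOMEM/EINVAL; dropping here is a policy choice */

	/* The helper invalidates the data bounds; recheck before writing. */
	data = msg->data;
	data_end = msg->data_end;
	if (data + 10 > data_end)
		return SK_PASS;

	__builtin_memset(data, 0, 10);	/* fill in the new header bytes */
	return SK_PASS;
}

char _license[] SEC("license") = "GPL";

Note that bpf_msg_push_data() may have to allocate and copy when the ring is full or the offset falls inside an element, so the error path above is not just theoretical.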