From 83134ef4609388f6b9ca31a384f531155196c2a7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Oct 2024 12:13:31 +0200 Subject: netkit: Add option for scrubbing skb meta data Jordan reported that when running Cilium with netkit in per-endpoint-routes mode, network policy misclassifies traffic. In this direct routing mode of Cilium which is used in case of GKE/EKS/AKS, the Pod's BPF program to enforce policy sits on the netkit primary device's egress side. The issue here is that in case of netkit's netkit_prep_forward(), it will clear meta data such as skb->mark and skb->priority before executing the BPF program. Thus, identity data stored in there from earlier BPF programs (e.g. from tcx ingress on the physical device) gets cleared instead of being made available for the primary's program to process. While for traffic egressing the Pod via the peer device this might be desired, this is different for the primary one where compared to tcx egress on the host veth this information would be available. To address this, add a new parameter for the device orchestration to allow control of skb->mark and skb->priority scrubbing, to make the two accessible from BPF (and eventually leave it up to the program to scrub). By default, the current behavior is retained. For netkit peer this also enables the use case where applications could cooperate/signal intent to the BPF program. Note that struct netkit has a 4 byte hole between policy and bundle which is used here, in other words, struct netkit's first cacheline content used in fast-path does not get moved around. Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Reported-by: Jordan Rife Signed-off-by: Daniel Borkmann Cc: Nikolay Aleksandrov Link: https://github.com/cilium/cilium/issues/34042 Acked-by: Jakub Kicinski Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20241004101335.117711-1-daniel@iogearbox.net Signed-off-by: Martin KaFai Lau --- include/uapi/linux/if_link.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..2acc7687e017 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1292,6 +1292,19 @@ enum netkit_mode { NETKIT_L3, }; +/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to + * the BPF program if attached. This also means the latter can + * consume the two fields if they were populated earlier. + * + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before + * invoking the attached BPF program when the peer device resides + * in a different network namespace. This is the default behavior. + */ +enum netkit_scrub { + NETKIT_SCRUB_NONE, + NETKIT_SCRUB_DEFAULT, +}; + enum { IFLA_NETKIT_UNSPEC, IFLA_NETKIT_PEER_INFO, @@ -1299,6 +1312,8 @@ enum { IFLA_NETKIT_POLICY, IFLA_NETKIT_PEER_POLICY, IFLA_NETKIT_MODE, + IFLA_NETKIT_SCRUB, + IFLA_NETKIT_PEER_SCRUB, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3 From b692bf9a7543af7ad11a59d182a3757578f0ba53 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:53 +0200 Subject: xsk: Get rid of xdp_buff_xsk::xskb_list_node Let's bring xdp_buff_xsk back to occupying 2 cachelines by removing xskb_list_node - for the purpose of gathering the xskb frags free_list_node can be used, head of the list (xsk_buff_pool::xskb_list) stays as-is, just reuse the node ptr. It is safe to do as a single xdp_buff_xsk can never reside in two pool's lists simultaneously. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-2-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 1 - net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 1 - 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0a5dca2b2b3f..360bc1244c6a 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { - list_del(&pos->xskb_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_del(&pos->free_list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, xskb_list_node); + struct xdp_buff_xsk, free_list_node); if (frag) { - list_del(&frag->xskb_list_node); + list_del(&frag->free_list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->xskb_list_node); + list_del(&xskb->free_list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - xskb_list_node); + free_list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bacb33f1e3e5..aa7f1d0b3a5e 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -30,7 +30,6 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; - struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1140b2a120ca..9c93064349a8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->xskb_list_node); + list_del(&pos->free_list_node); } return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 521a2938e50a..e5368db7d18e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -102,7 +102,6 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); - INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else -- cgit v1.2.3 From 30ec2c1baaead43903ad63ff8e3083949059083c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:54 +0200 Subject: xsk: s/free_list_node/list_node/ Now that free_list_node's purpose is two-folded, make it just a 'list_node'. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-3-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 14 +++++++------- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 360bc1244c6a..40085afd9160 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { - list_del(&pos->free_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, free_list_node); + struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->free_list_node); + list_del(&frag->list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->free_list_node); + list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - free_list_node); + list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index aa7f1d0b3a5e..af8b6f776f86 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; u64 orig_addr; - struct list_head free_list_node; + struct list_head list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c93064349a8..520023405908 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->free_list_node); + list_del(&pos->list_node); } return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e5368db7d18e..973557d5e4f7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -101,7 +101,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -549,8 +549,8 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } else { pool->free_list_cnt--; xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, - free_list_node); - list_del_init(&xskb->free_list_node); + list_node); + list_del_init(&xskb->list_node); } xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; @@ -616,8 +616,8 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3 i = nb_entries; while (i--) { - xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); - list_del_init(&xskb->free_list_node); + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, list_node); + list_del_init(&xskb->list_node); *xdp = &xskb->xdp; xdp++; @@ -687,11 +687,11 @@ EXPORT_SYMBOL(xp_can_alloc); void xp_free(struct xdp_buff_xsk *xskb) { - if (!list_empty(&xskb->free_list_node)) + if (!list_empty(&xskb->list_node)) return; xskb->pool->free_list_cnt++; - list_add(&xskb->free_list_node, &xskb->pool->free_list); + list_add(&xskb->list_node, &xskb->pool->free_list); } EXPORT_SYMBOL(xp_free); -- cgit v1.2.3 From bea14124bacbe5c9366381e62635eed28ac892ae Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:55 +0200 Subject: xsk: Get rid of xdp_buff_xsk::orig_addr Continue the process of dieting xdp_buff_xsk by removing orig_addr member. It can be calculated from xdp->data_hard_start where it was previously used, so it is not anything that has to be carried around in struct used widely in hot path. This has been used for initializing xdp_buff_xsk::frame_dma during pool setup and as a shortcut in xp_get_handle() to retrieve address provided to xsk Rx queue. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-4-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 19 +++++++++++-------- net/xdp/xsk.c | 2 +- net/xdp/xsk_buff_pool.c | 4 +++- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index af8b6f776f86..468a23b1b4c5 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -28,7 +28,6 @@ struct xdp_buff_xsk { dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - u64 orig_addr; struct list_head list_node; }; @@ -119,7 +118,6 @@ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { - xskb->orig_addr = addr; xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } @@ -221,14 +219,19 @@ static inline void xp_release(struct xdp_buff_xsk *xskb) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } -static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) { - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + if (!pool->unaligned) + return orig_addr; + + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + orig_addr -= offset; + offset += pool->headroom; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 520023405908..6c31c1de1619 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -141,7 +141,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u64 addr; int err; - addr = xp_get_handle(xskb); + addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 973557d5e4f7..7ecd4ccd2473 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -416,8 +416,10 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ for (i = 0; i < pool->heads_cnt; i++) { struct xdp_buff_xsk *xskb = &pool->heads[i]; + u64 orig_addr; - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + orig_addr = xskb->xdp.data_hard_start - pool->addrs - pool->headroom; + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, orig_addr); } } -- cgit v1.2.3 From 6e126872191df946a6fe01b79273119d32d96711 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:56 +0200 Subject: xsk: Carry a copy of xdp_zc_max_segs within xsk_buff_pool This so we avoid dereferencing struct net_device within hot path. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-5-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 1 + net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 468a23b1b4c5..bb03cee716b3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -76,6 +76,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 7ecd4ccd2473..e946ba4a5ccf 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -229,6 +229,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_xsk; } pool->umem->zc = true; + pool->xdp_zc_max_segs = netdev->xdp_zc_max_segs; return 0; err_unreg_xsk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 406b20dfee8d..46d87e961ad6 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -260,7 +260,7 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, nr_frags = 0; } else { nr_frags++; - if (nr_frags == pool->netdev->xdp_zc_max_segs) { + if (nr_frags == pool->xdp_zc_max_segs) { nr_frags = 0; break; } -- cgit v1.2.3