From a5546e18f77c0cb15d434bf5b92647687fe483e3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:48 +0100 Subject: net: Add queue-create operation Add a ynl netdev family operation called queue-create that creates a new queue on a netdevice: name: queue-create attribute-set: queue flags: [admin-perm] do: request: attributes: - ifindex - type - lease reply: &queue-create-op attributes: - id This is a generic operation such that it can be extended for various use cases in future. Right now it is mandatory to specify ifindex, the queue type which is enforced to rx and a lease. The newly created queue id is returned to the caller. A queue from a virtual device can have a lease which refers to another queue from a physical device. This is useful for memory providers and AF_XDP operations which take an ifindex and queue id to allow applications to bind against virtual devices in containers. The lease couples both queues together and allows to proxy the operations from a virtual device in a container to the physical device. In future, the nested lease attribute can be lifted and made optional for other use-cases such as dynamic queue creation for physical netdevs. The lack of lease and the specification of the physical device as an ifindex will imply that we need a real queue to be allocated. Similarly, the queue type enforcement to rx can then be lifted as well to support tx. An early implementation had only driver-specific integration [0], but in order for other virtual devices to reuse, it makes sense to have this as a generic API in core net. For leasing queues, the virtual netdev must have real_num_rx_queue less than num_rx_queues at the time of calling queue-create. The queue-type must be rx as only rx queues are supported for leasing for now. We also enforce that the queue-create ifindex must point to a virtual device, and that the nested lease attribute's ifindex must point to a physical device. The nested lease attribute set contains a netns-id attribute which is currently only intended for dumping as part of the queue-get operation. Also, it is modeled as an s32 type similarly as done elsewhere in the stack. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0] Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-2-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/uapi/linux/netdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From 31127deddef4a13628202a7bfef912e6c1ba3e57 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:49 +0100 Subject: net: Implement netdev_nl_queue_create_doit Implement netdev_nl_queue_create_doit which creates a new rx queue in a virtual netdev and then leases it to a rx queue in a physical netdev. Example with ynl client: # ./pyynl/cli.py \ --spec ~/netlink/specs/netdev.yaml \ --do queue-create \ --json '{"ifindex": 8, "type": "rx", "lease": {"ifindex": 4, "queue": {"type": "rx", "id": 15}}}' {'id': 1} Note that the netdevice locking order is always from the virtual to the physical device. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-3-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/net/netdev_queues.h | 19 +++++- include/net/netdev_rx_queue.h | 9 ++- include/net/xdp_sock_drv.h | 2 +- net/core/dev.c | 7 ++ net/core/dev.h | 2 + net/core/netdev-genl.c | 146 +++++++++++++++++++++++++++++++++++++++++- net/core/netdev_queues.c | 57 +++++++++++++++++ net/core/netdev_rx_queue.c | 46 +++++++++++-- net/xdp/xsk.c | 2 +- 9 files changed, 278 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b55d3b9cb9c2..81dc7cb2360c 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -130,6 +130,11 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used * for this queue. Return NULL on error. * + * @ndo_queue_create: Create a new RX queue which can be leased to another queue. + * Ops on this queue are redirected to the leased queue e.g. + * when opening a memory provider. Return the new queue id on + * success. Return negative error code on failure. + * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -149,9 +154,12 @@ struct netdev_queue_mgmt_ops { int idx); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); + int (*ndo_queue_create)(struct net_device *dev); }; -bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); /** * DOC: Lockless queue stopping / waking helpers. @@ -340,5 +348,10 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) }) struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); - -#endif +bool netdev_can_create_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_can_lease_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_queue_busy(struct net_device *dev, int idx, + struct netlink_ext_ack *extack); +#endif /* _LINUX_NET_QUEUES_H */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 8cdcd138b33f..1cacc2451516 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -28,6 +28,8 @@ struct netdev_rx_queue { #endif struct napi_struct *napi; struct pp_memory_provider_params mp_params; + struct netdev_rx_queue *lease; + netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -57,5 +59,8 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) } int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); - -#endif +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +#endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 242e34f771cc..c07cfb431eac 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); -struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); diff --git a/net/core/dev.c b/net/core/dev.c index 2661b68f5be3..13a3de63a825 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1114,6 +1114,13 @@ netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) return __netdev_put_lock_ops_compat(dev, net); } +struct net_device * +netdev_put_lock(struct net_device *dev, netdevice_tracker *tracker) +{ + netdev_tracker_free(dev, tracker); + return __netdev_put_lock(dev, dev_net(dev)); +} + struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) diff --git a/net/core/dev.h b/net/core/dev.h index da18536cbd35..9bcb76b325d0 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,6 +30,8 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); +struct net_device *netdev_put_lock(struct net_device *dev, + netdevice_tracker *tracker); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index aae75431858d..cd4dc4eef029 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -1122,7 +1122,151 @@ err_genlmsg_free: int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; + const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; + int err, ifindex, ifindex_lease, queue_id, queue_id_lease; + struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; + struct netdev_rx_queue *rxq, *rxq_lease; + struct net_device *dev, *dev_lease; + netdevice_tracker dev_tracker; + struct nlattr *nest; + struct sk_buff *rsp; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) + return -EINVAL; + if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != + NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); + + nest = info->attrs[NETDEV_A_QUEUE_LEASE]; + err = nla_parse_nested(ltb, lmaxtype, nest, + netdev_lease_nl_policy, info->extack); + if (err < 0) + return err; + if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || + NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) + return -EINVAL; + if (ltb[NETDEV_A_LEASE_NETNS_ID]) { + NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]); + return -EINVAL; + } + + ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); + + nest = ltb[NETDEV_A_LEASE_QUEUE]; + err = nla_parse_nested(qtb, qmaxtype, nest, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + return err; + if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) + return -EINVAL; + if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + if (ifindex == ifindex_lease) { + NL_SET_ERR_MSG(info->extack, + "Lease ifindex cannot be the same as queue creation ifindex"); + return -EINVAL; + } + + queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + /* Locking order is always from the virtual to the physical device + * since this is also the same order when applications open the + * memory provider later on. + */ + dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!dev) { + err = -ENODEV; + goto err_genlmsg_free; + } + if (!netdev_can_create_queue(dev, info->extack)) { + err = -EINVAL; + goto err_unlock_dev; + } + + dev_lease = netdev_get_by_index(genl_info_net(info), ifindex_lease, + &dev_tracker, GFP_KERNEL); + if (!dev_lease) { + err = -ENODEV; + goto err_unlock_dev; + } + if (!netdev_can_lease_queue(dev_lease, info->extack)) { + netdev_put(dev_lease, &dev_tracker); + err = -EINVAL; + goto err_unlock_dev; + } + + dev_lease = netdev_put_lock(dev_lease, &dev_tracker); + if (!dev_lease) { + err = -ENODEV; + goto err_unlock_dev; + } + if (queue_id_lease >= dev_lease->real_num_rx_queues) { + err = -ERANGE; + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); + goto err_unlock_dev_lease; + } + if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) { + err = -EBUSY; + goto err_unlock_dev_lease; + } + + rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); + rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); + + if (rxq->lease && rxq->lease->dev != dev_lease) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Leasing multiple queues from different devices not supported"); + goto err_unlock_dev_lease; + } + + err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev); + if (err < 0) { + NL_SET_ERR_MSG(info->extack, + "Device is unable to create a new queue"); + goto err_unlock_dev_lease; + } + + rxq = __netif_get_rx_queue(dev, queue_id); + netdev_rx_queue_lease(rxq, rxq_lease); + + nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); + genlmsg_end(rsp, hdr); + + netdev_unlock(dev_lease); + netdev_unlock(dev); + + return genlmsg_reply(rsp, info); + +err_unlock_dev_lease: + netdev_unlock(dev_lease); +err_unlock_dev: + netdev_unlock(dev); +err_genlmsg_free: + nlmsg_free(rsp); + return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c index 251f27a8307f..fae92ee090c4 100644 --- a/net/core/netdev_queues.c +++ b/net/core/netdev_queues.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include +#include +#include /** * netdev_queue_get_dma_dev() - get dma device for zero-copy operations @@ -25,3 +27,58 @@ struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; } +bool netdev_can_create_queue(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + if (dev->dev.parent) { + NL_SET_ERR_MSG(extack, "Device is not a virtual device"); + return false; + } + if (!dev->queue_mgmt_ops || + !dev->queue_mgmt_ops->ndo_queue_create) { + NL_SET_ERR_MSG(extack, "Device does not support queue creation"); + return false; + } + if (dev->real_num_rx_queues < 1 || + dev->real_num_tx_queues < 1) { + NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); + return false; + } + return true; +} + +bool netdev_can_lease_queue(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + if (!dev->dev.parent) { + NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); + return false; + } + if (!netif_device_present(dev)) { + NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); + return false; + } + if (!dev->queue_mgmt_ops) { + NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); + return false; + } + return true; +} + +bool netdev_queue_busy(struct net_device *dev, int idx, + struct netlink_ext_ack *extack) +{ + if (netif_rxq_is_leased(dev, idx)) { + NL_SET_ERR_MSG(extack, "Lease device queue is already leased"); + return true; + } + if (xsk_get_pool_from_qid(dev, idx)) { + NL_SET_ERR_MSG(extack, "Lease device queue in use by AF_XDP"); + return true; + } + if (netif_rxq_has_mp(dev, idx)) { + NL_SET_ERR_MSG(extack, "Lease device queue in use by memory provider"); + return true; + } + return false; +} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index c7d9341b7630..830c1a964c36 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,15 +9,53 @@ #include "page_pool_priv.h" -/* See also page_pool_is_unreadable() */ -bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src) { - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + netdev_assert_locked(rxq_src->dev); + netdev_assert_locked(rxq_dst->dev); + + netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); - return !!rxq->mp_params.mp_ops; + WRITE_ONCE(rxq_src->lease, rxq_dst); + WRITE_ONCE(rxq_dst->lease, rxq_src); +} + +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src) +{ + netdev_assert_locked(rxq_dst->dev); + netdev_assert_locked(rxq_src->dev); + + WRITE_ONCE(rxq_src->lease, NULL); + WRITE_ONCE(rxq_dst->lease, NULL); + + netdev_put(rxq_src->dev, &rxq_src->lease_tracker); +} + +bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) +{ + if (rxq_idx < dev->real_num_rx_queues) + return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); + return false; +} + +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) +{ + if (rxq_idx < dev->real_num_rx_queues) + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; + return false; } EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); +bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) +{ + if (rxq_idx < dev->real_num_rx_queues) + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; + return false; +} + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3b46bc635c43..67f32a249ecb 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -103,7 +103,7 @@ bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xsk_uses_need_wakeup); -struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) -- cgit v1.2.3 From 9e2103f36110b50fc30be333fe2b43522c2dfa2a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:50 +0100 Subject: net: Add lease info to queue-get response Populate nested lease info to the queue-get response that returns the ifindex, queue id with type and optionally netns id if the device resides in a different netns. Example with ynl client: # ip a [...] 4: enp10s0f0np0: mtu 1500 xdp/id:24 qdisc mq state UP group default qlen 1000 link/ether e8:eb:d3:a3:43:f6 brd ff:ff:ff:ff:ff:ff inet 10.0.0.2/24 scope global enp10s0f0np0 valid_lft forever preferred_lft forever inet6 fe80::eaeb:d3ff:fea3:43f6/64 scope link proto kernel_ll valid_lft forever preferred_lft forever [...] # ethtool -i enp10s0f0np0 driver: mlx5_core [...] # ./pyynl/cli.py \ --spec ~/netlink/specs/netdev.yaml \ --do queue-get \ --json '{"ifindex": 4, "id": 15, "type": "rx"}' {'id': 15, 'ifindex': 4, 'lease': {'ifindex': 8, 'netns-id': 0, 'queue': {'id': 1, 'type': 'rx'}}, 'napi-id': 8227, 'type': 'rx', 'xsk': {}} # ip netns list foo (id: 0) # ip netns exec foo ip a [...] 8: nk@NONE: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff inet6 fe80::200:ff:fe00:0/64 scope link proto kernel_ll valid_lft forever preferred_lft forever [...] # ip netns exec foo ethtool -i nk driver: netkit [...] # ip netns exec foo ls /sys/class/net/nk/queues/ rx-0 rx-1 tx-0 # ip netns exec foo ./pyynl/cli.py \ --spec ~/netlink/specs/netdev.yaml \ --do queue-get \ --json '{"ifindex": 8, "id": 1, "type": "rx"}' {'id': 1, 'ifindex': 8, 'type': 'rx'} Note that the caller of netdev_nl_queue_fill_one() holds the netdevice lock. For the queue-get we do not lock both devices. When queues get {un,}leased, both devices are locked, thus if __netif_get_rx_queue_peer() returns true, the peer pointer points to a valid device. The netns-id is fetched via peernet2id_alloc() similarly as done in OVS. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Nikolay Aleksandrov Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260115082603.219152-4-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/net/netdev_rx_queue.h | 10 ++++++++++ net/core/netdev-genl.c | 36 ++++++++++++++++++++++++++++++++++ net/core/netdev_rx_queue.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) (limited to 'include') diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 1cacc2451516..de04fdfdad72 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -63,4 +63,14 @@ void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, struct netdev_rx_queue *rxq_src); void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, struct netdev_rx_queue *rxq_src); +bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq); + +enum netif_lease_dir { + NETIF_VIRT_TO_PHYS, + NETIF_PHYS_TO_VIRT, +}; + +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, + enum netif_lease_dir dir); #endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index cd4dc4eef029..51c830f88f10 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -391,8 +391,11 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; + struct net_device *orig_netdev = netdev; + struct nlattr *nest_lease, *nest_queue; struct netdev_rx_queue *rxq; struct netdev_queue *txq; + u32 lease_q_idx = q_idx; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -410,6 +413,37 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; + if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) { + struct net *net, *peer_net; + + nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); + if (!nest_lease) + goto nla_put_failure; + nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); + if (!nest_queue) + goto nla_put_failure; + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx)) + goto nla_put_failure; + if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) + goto nla_put_failure; + nla_nest_end(rsp, nest_queue); + if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, + READ_ONCE(netdev->ifindex))) + goto nla_put_failure; + rcu_read_lock(); + peer_net = dev_net_rcu(netdev); + net = dev_net_rcu(orig_netdev); + if (!net_eq(net, peer_net)) { + s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); + + if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) + goto nla_put_failure_unlock; + } + rcu_read_unlock(); + nla_nest_end(rsp, nest_lease); + netdev = orig_netdev; + } + params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) @@ -437,6 +471,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 830c1a964c36..61fe25817e98 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -40,6 +40,51 @@ bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) return false; } +static bool netif_lease_dir_ok(const struct net_device *dev, + enum netif_lease_dir dir) +{ + if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) + return true; + if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) + return true; + return false; +} + +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, + enum netif_lease_dir dir) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); + + if (rxq->lease) { + if (!netif_lease_dir_ok(orig_dev, dir)) + return NULL; + rxq = rxq->lease; + *rxq_idx = get_netdev_rx_queue_index(rxq); + *dev = rxq->dev; + } + return rxq; +} + +bool netif_rx_queue_lease_get_owner(struct net_device **dev, + unsigned int *rxq_idx) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq; + + /* The physical device needs to be locked. If there is indeed a lease, + * then the virtual device holds a reference on the physical device + * and the lease stays active until the virtual device is torn down. + * When queues get {un,}leased both devices are always locked. + */ + netdev_ops_assert_locked(orig_dev); + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT); + if (rxq && orig_dev != *dev) + return true; + return false; +} + /* See also page_pool_is_unreadable() */ bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) { -- cgit v1.2.3 From 0caa9a8ddec3bf87bffb0eb99635068ddddce35d Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:25:52 +0100 Subject: net: Proxy net_mp_{open,close}_rxq for leased queues When a process in a container wants to setup a memory provider, it will use the virtual netdev and a leased rxq, and call net_mp_{open,close}_rxq to try and restart the queue. At this point, proxy the queue restart on the real rxq in the physical netdev. For memory providers (io_uring zero-copy rx and devmem), it causes the real rxq in the physical netdev to be filled from a memory provider that has DMA mapped memory from a process within a container. Signed-off-by: David Wei Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260115082603.219152-6-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/net/netdev_rx_queue.h | 4 ++ include/net/page_pool/memory_provider.h | 4 +- net/core/netdev_rx_queue.c | 78 +++++++++++++++++++++++++-------- 3 files changed, 66 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index de04fdfdad72..508d11afaecb 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -73,4 +73,8 @@ enum netif_lease_dir { struct netdev_rx_queue * __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, enum netif_lease_dir dir); +struct netdev_rx_queue * +netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); +void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, + struct net_device *dev); #endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index ada4f968960a..b6f811c3416b 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); -int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *p); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack); -void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *old_p); void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 61fe25817e98..75c7a68cb90d 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -67,6 +67,29 @@ __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, return rxq; } +struct netdev_rx_queue * +netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq; + + /* Locking order is always from the virtual to the physical device + * see netdev_nl_queue_create_doit(). + */ + netdev_ops_assert_locked(orig_dev); + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); + if (rxq && orig_dev != *dev) + netdev_lock(*dev); + return rxq; +} + +void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, + struct net_device *dev) +{ + if (orig_dev != dev) + netdev_unlock(dev); +} + bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq_idx) { @@ -183,49 +206,63 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { + struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int ret; if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; - if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); + if (!rxq) { + NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev"); + return -EBUSY; + } + if (!dev->dev.parent) { + NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev"); + ret = -EBUSY; + goto out; + } if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (dev_xdp_prog_count(dev)) { NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); - return -EEXIST; + ret = -EEXIST; + goto out; } - - rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - return -EEXIST; + ret = -EEXIST; + goto out; } #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) { NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - return -EBUSY; + ret = -EBUSY; + goto out; } #endif - rxq->mp_params = *p; ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; } +out: + netif_put_rx_queue_lease_locked(orig_dev, dev); return ret; } @@ -240,38 +277,43 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, return ret; } -void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p) { + struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int err; - if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues)) return; - rxq = __netif_get_rx_queue(dev, ifq_idx); + rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); + if (WARN_ON_ONCE(!rxq)) + return; /* Callers holding a netdev ref may get here after we already * went thru shutdown via dev_memory_provider_uninstall(). */ if (dev->reg_state > NETREG_REGISTERED && !rxq->mp_params.mp_ops) - return; + goto out; if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || rxq->mp_params.mp_priv != old_p->mp_priv)) - return; + goto out; rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, ifq_idx); + err = netdev_rx_queue_restart(dev, rxq_idx); WARN_ON(err && err != -ENETDOWN); +out: + netif_put_rx_queue_lease_locked(orig_dev, dev); } -void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *old_p) { netdev_lock(dev); - __net_mp_close_rxq(dev, ifq_idx, old_p); + __net_mp_close_rxq(dev, rxq_idx, old_p); netdev_unlock(dev); } -- cgit v1.2.3 From b5c3fa4a0b16d4a7d0bd0e5626a13fec0024030a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:56 +0100 Subject: netkit: Add single device mode for netkit Add a single device mode for netkit instead of netkit pairs. The primary target for the paired devices is to connect network namespaces, of course, and support has been implemented in projects like Cilium [0]. For the rxq leasing the plan is to support two main scenarios related to single device mode: * For the use-case of io_uring zero-copy, the control plane can either set up a netkit pair where the peer device can perform rxq leasing which is then tied to the lifetime of the peer device, or the control plane can use a regular netkit pair to connect the hostns to a Pod/container and dynamically add/remove rxq leasing through a single device without having to interrupt the device pair. In the case of io_uring, the memory pool is used as skb non-linear pages, and thus the skb will go its way through the regular stack into netkit. Things like the netkit policy when no BPF is attached or skb scrubbing etc apply as-is in case the paired devices are used, or if the backend memory is tied to the single device and traffic goes through a paired device. * For the use-case of AF_XDP, the control plane needs to use netkit in the single device mode. The single device mode currently enforces only a pass policy when no BPF is attached, and does not yet support BPF link attachments for AF_XDP. skbs sent to that device get dropped at the moment. Given AF_XDP operates at a lower layer of the stack tying this to the netkit pair did not make sense. In future, the plan is to allow BPF at the XDP layer which can: i) process traffic coming from the AF_XDP application (e.g. QEMU with AF_XDP backend) to filter egress traffic or to push selected egress traffic up to the single netkit device to the local stack (e.g. DHCP requests), and ii) vice-versa skbs sent to the single netkit into the AF_XDP application (e.g. DHCP replies). Also, the control-plane can dynamically manage rxq leasing for the single netkit device without having to interrupt (e.g. down/up cycle) the main netkit pair for the Pod which has traffic going in and out. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Jordan Rife Reviewed-by: Nikolay Aleksandrov Link: https://docs.cilium.io/en/stable/operations/performance/tuning/#netkit-device-mode [0] Link: https://patch.msgid.link/20260115082603.219152-10-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- drivers/net/netkit.c | 110 +++++++++++++++++++++++++++---------------- include/uapi/linux/if_link.h | 6 +++ 2 files changed, 76 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 0a2fef7caccb..76332a98af92 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -26,6 +26,7 @@ struct netkit { __cacheline_group_begin(netkit_slowpath); enum netkit_mode mode; + enum netkit_pairing pair; bool primary; u32 headroom; __cacheline_group_end(netkit_slowpath); @@ -135,6 +136,10 @@ static int netkit_open(struct net_device *dev) struct netkit *nk = netkit_priv(dev); struct net_device *peer = rtnl_dereference(nk->peer); + if (nk->pair == NETKIT_DEVICE_SINGLE) { + netif_carrier_on(dev); + return 0; + } if (!peer) return -ENOTCONN; if (peer->flags & IFF_UP) { @@ -335,6 +340,7 @@ static int netkit_new_link(struct net_device *dev, enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; + enum netkit_pairing pair = NETKIT_DEVICE_PAIR; enum netkit_action policy_prim = NETKIT_PASS; enum netkit_action policy_peer = NETKIT_PASS; struct nlattr **data = params->data; @@ -343,7 +349,8 @@ static int netkit_new_link(struct net_device *dev, struct nlattr **tb = params->tb; u16 headroom = 0, tailroom = 0; struct ifinfomsg *ifmp = NULL; - struct net_device *peer; + struct net_device *peer = NULL; + bool seen_peer = false; char ifname[IFNAMSIZ]; struct netkit *nk; int err; @@ -380,6 +387,12 @@ static int netkit_new_link(struct net_device *dev, headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); if (data[IFLA_NETKIT_TAILROOM]) tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); + if (data[IFLA_NETKIT_PAIRING]) + pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); + + seen_peer = data[IFLA_NETKIT_PEER_INFO] || + data[IFLA_NETKIT_PEER_SCRUB] || + data[IFLA_NETKIT_PEER_POLICY]; } if (ifmp && tbp[IFLA_IFNAME]) { @@ -392,45 +405,46 @@ static int netkit_new_link(struct net_device *dev, if (mode != NETKIT_L2 && (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) return -EOPNOTSUPP; + if (pair == NETKIT_DEVICE_SINGLE && + (tb != tbp || seen_peer || policy_prim != NETKIT_PASS)) + return -EOPNOTSUPP; - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, - &netkit_link_ops, tbp, extack); - if (IS_ERR(peer)) - return PTR_ERR(peer); - - netif_inherit_tso_max(peer, dev); - if (headroom) { - peer->needed_headroom = headroom; - dev->needed_headroom = headroom; - } - if (tailroom) { - peer->needed_tailroom = tailroom; - dev->needed_tailroom = tailroom; - } - - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) - eth_hw_addr_random(peer); - if (ifmp && dev->ifindex) - peer->ifindex = ifmp->ifi_index; - - nk = netkit_priv(peer); - nk->primary = false; - nk->policy = policy_peer; - nk->scrub = scrub_peer; - nk->mode = mode; - nk->headroom = headroom; - bpf_mprog_bundle_init(&nk->bundle); + if (pair == NETKIT_DEVICE_PAIR) { + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, + &netkit_link_ops, tbp, extack); + if (IS_ERR(peer)) + return PTR_ERR(peer); + + netif_inherit_tso_max(peer, dev); + if (headroom) + peer->needed_headroom = headroom; + if (tailroom) + peer->needed_tailroom = tailroom; + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) + eth_hw_addr_random(peer); + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; - err = register_netdevice(peer); - if (err < 0) - goto err_register_peer; - netif_carrier_off(peer); - if (mode == NETKIT_L2) - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); + nk = netkit_priv(peer); + nk->primary = false; + nk->policy = policy_peer; + nk->scrub = scrub_peer; + nk->mode = mode; + nk->pair = pair; + nk->headroom = headroom; + bpf_mprog_bundle_init(&nk->bundle); + + err = register_netdevice(peer); + if (err < 0) + goto err_register_peer; + netif_carrier_off(peer); + if (mode == NETKIT_L2) + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); - err = rtnl_configure_link(peer, NULL, 0, NULL); - if (err < 0) - goto err_configure_peer; + err = rtnl_configure_link(peer, NULL, 0, NULL); + if (err < 0) + goto err_configure_peer; + } if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); @@ -438,12 +452,17 @@ static int netkit_new_link(struct net_device *dev, nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else strscpy(dev->name, "nk%d", IFNAMSIZ); + if (headroom) + dev->needed_headroom = headroom; + if (tailroom) + dev->needed_tailroom = tailroom; nk = netkit_priv(dev); nk->primary = true; nk->policy = policy_prim; nk->scrub = scrub_prim; nk->mode = mode; + nk->pair = pair; nk->headroom = headroom; bpf_mprog_bundle_init(&nk->bundle); @@ -455,10 +474,12 @@ static int netkit_new_link(struct net_device *dev, dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); rcu_assign_pointer(netkit_priv(dev)->peer, peer); - rcu_assign_pointer(netkit_priv(peer)->peer, dev); + if (peer) + rcu_assign_pointer(netkit_priv(peer)->peer, dev); return 0; err_configure_peer: - unregister_netdevice(peer); + if (peer) + unregister_netdevice(peer); return err; err_register_peer: free_netdev(peer); @@ -518,6 +539,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi nk = netkit_priv(dev); if (!nk->primary) return ERR_PTR(-EACCES); + if (nk->pair == NETKIT_DEVICE_SINGLE) + return ERR_PTR(-EOPNOTSUPP); if (which == BPF_NETKIT_PEER) { dev = rcu_dereference_rtnl(nk->peer); if (!dev) @@ -879,6 +902,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], { IFLA_NETKIT_PEER_INFO, "peer info" }, { IFLA_NETKIT_HEADROOM, "headroom" }, { IFLA_NETKIT_TAILROOM, "tailroom" }, + { IFLA_NETKIT_PAIRING, "pairing" }, }; if (!nk->primary) { @@ -898,9 +922,11 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], } if (data[IFLA_NETKIT_POLICY]) { + err = -EOPNOTSUPP; attr = data[IFLA_NETKIT_POLICY]; policy = nla_get_u32(attr); - err = netkit_check_policy(policy, attr, extack); + if (nk->pair == NETKIT_DEVICE_PAIR) + err = netkit_check_policy(policy, attr, extack); if (err) return err; WRITE_ONCE(nk->policy, policy); @@ -931,6 +957,7 @@ static size_t netkit_get_size(const struct net_device *dev) nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 0; } @@ -951,6 +978,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev) return -EMSGSIZE; if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) return -EMSGSIZE; + if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) + return -EMSGSIZE; if (peer) { nk = netkit_priv(peer); @@ -972,6 +1001,7 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { [IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), + [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, .reject_message = "Primary attribute is read-only" }, }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3b491d96e52e..bbd565757298 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,6 +1296,11 @@ enum netkit_mode { NETKIT_L3, }; +enum netkit_pairing { + NETKIT_DEVICE_PAIR, + NETKIT_DEVICE_SINGLE, +}; + /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1320,6 +1325,7 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, + IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3 From eef51113f8afd35c69cbf3702e0ecd55263f2416 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:58 +0100 Subject: netkit: Add netkit notifier to check for unregistering devices Add a netdevice notifier in netkit to watch for NETDEV_UNREGISTER events. If the target device is indeed NETREG_UNREGISTERING and previously leased a queue to a netkit device, then collect the related netkit devices and batch-unregister_netdevice_many() them. If this would not be done, then the netkit device would hold a reference on the physical device preventing it from going away. However, in case of both io_uring zero-copy as well as AF_XDP this situation is handled gracefully and the allocated resources are torn down. In the case where mentioned infra is used through netkit, the applications have a reference on netkit, and netkit in turn holds a reference on the physical device. In order to have netkit release the reference on the physical device, we need such watcher to then unregister the netkit ones. This is generally quite similar to the dependency handling in case of tunnels (e.g. vxlan bound to a underlying netdev) where the tunnel device gets removed along with the physical device. # ip a [...] 4: enp10s0f0np0: mtu 1500 qdisc mq state DOWN group default qlen 1000 link/ether e8:eb:d3:a3:43:f6 brd ff:ff:ff:ff:ff:ff inet 10.0.0.2/24 scope global enp10s0f0np0 valid_lft forever preferred_lft forever [...] 8: nk@NONE: mtu 1500 qdisc noop state DOWN group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff [...] # rmmod mlx5_ib # rmmod mlx5_core [ 309.261822] mlx5_core 0000:0a:00.0 mlx5_0: Port: 1 Link DOWN [ 344.235236] mlx5_core 0000:0a:00.1: E-Switch: Unload vfs: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 344.246948] mlx5_core 0000:0a:00.1: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 344.463754] mlx5_core 0000:0a:00.1: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 344.770155] mlx5_core 0000:0a:00.1: E-Switch: cleanup [ 345.345709] mlx5_core 0000:0a:00.0: E-Switch: Unload vfs: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 345.357524] mlx5_core 0000:0a:00.0: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 350.995989] mlx5_core 0000:0a:00.0: E-Switch: Disable: mode(LEGACY), nvfs(0), necvfs(0), active vports(0) [ 351.574396] mlx5_core 0000:0a:00.0: E-Switch: cleanup # ip a [...] [ both enp10s0f0np0 and nk gone ] [...] Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-12-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- drivers/net/netkit.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/netdevice.h | 6 +++++ 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index c6688b604337..5c417c6055d8 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -1039,6 +1039,48 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[], return 0; } +static void netkit_check_lease_unregister(struct net_device *dev) +{ + LIST_HEAD(list_kill); + u32 q_idx; + + if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || + !dev->dev.parent) + return; + + netdev_lock_ops(dev); + for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { + struct net_device *tmp = dev; + u32 tmp_q_idx = q_idx; + + if (netif_rx_queue_lease_get_owner(&tmp, &tmp_q_idx)) { + if (tmp->netdev_ops != &netkit_netdev_ops) + continue; + /* A single phys device can have multiple queues leased + * to one netkit device. We can only queue that netkit + * device once to the list_kill. Queues of that phys + * device can be leased with different individual netkit + * devices, hence we batch via list_kill. + */ + if (unregister_netdevice_queued(tmp)) + continue; + netkit_del_link(tmp, &list_kill); + } + } + netdev_unlock_ops(dev); + unregister_netdevice_many(&list_kill); +} + +static int netkit_notifier(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (event == NETDEV_UNREGISTER) + netkit_check_lease_unregister(dev); + return NOTIFY_DONE; +} + static size_t netkit_get_size(const struct net_device *dev) { return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ @@ -1115,18 +1157,31 @@ static struct rtnl_link_ops netkit_link_ops = { .maxtype = IFLA_NETKIT_MAX, }; +static struct notifier_block netkit_netdev_notifier = { + .notifier_call = netkit_notifier, +}; + static __init int netkit_mod_init(void) { + int ret; + BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || (int)NETKIT_PASS != (int)TCX_PASS || (int)NETKIT_DROP != (int)TCX_DROP || (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); - return rtnl_link_register(&netkit_link_ops); + ret = rtnl_link_register(&netkit_link_ops); + if (ret) + return ret; + ret = register_netdevice_notifier(&netkit_netdev_notifier); + if (ret) + rtnl_link_unregister(&netkit_link_ops); + return ret; } static __exit void netkit_mod_exit(void) { + unregister_netdevice_notifier(&netkit_netdev_notifier); rtnl_link_unregister(&netkit_link_ops); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d99b0fbc1942..4d146c000e21 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3400,11 +3400,17 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); + static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } +static inline bool unregister_netdevice_queued(const struct net_device *dev) +{ + return !list_empty(&dev->unreg_list); +} + int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); -- cgit v1.2.3