From 31127deddef4a13628202a7bfef912e6c1ba3e57 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:49 +0100 Subject: net: Implement netdev_nl_queue_create_doit Implement netdev_nl_queue_create_doit which creates a new rx queue in a virtual netdev and then leases it to a rx queue in a physical netdev. Example with ynl client: # ./pyynl/cli.py \ --spec ~/netlink/specs/netdev.yaml \ --do queue-create \ --json '{"ifindex": 8, "type": "rx", "lease": {"ifindex": 4, "queue": {"type": "rx", "id": 15}}}' {'id': 1} Note that the netdevice locking order is always from the virtual to the physical device. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20260115082603.219152-3-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/net/netdev_queues.h | 19 ++++++++++++++++--- include/net/netdev_rx_queue.h | 9 +++++++-- include/net/xdp_sock_drv.h | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b55d3b9cb9c2..81dc7cb2360c 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -130,6 +130,11 @@ void netdev_stat_queue_sum(struct net_device *netdev, * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used * for this queue. Return NULL on error. * + * @ndo_queue_create: Create a new RX queue which can be leased to another queue. + * Ops on this queue are redirected to the leased queue e.g. + * when opening a memory provider. Return the new queue id on + * success. Return negative error code on failure. + * * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only * be called for an interface which is open. @@ -149,9 +154,12 @@ struct netdev_queue_mgmt_ops { int idx); struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, int idx); + int (*ndo_queue_create)(struct net_device *dev); }; -bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); +bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); /** * DOC: Lockless queue stopping / waking helpers. @@ -340,5 +348,10 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) }) struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); - -#endif +bool netdev_can_create_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_can_lease_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_queue_busy(struct net_device *dev, int idx, + struct netlink_ext_ack *extack); +#endif /* _LINUX_NET_QUEUES_H */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 8cdcd138b33f..1cacc2451516 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -28,6 +28,8 @@ struct netdev_rx_queue { #endif struct napi_struct *napi; struct pp_memory_provider_params mp_params; + struct netdev_rx_queue *lease; + netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -57,5 +59,8 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) } int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); - -#endif +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +#endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 242e34f771cc..c07cfb431eac 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); -struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, +struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); -- cgit v1.2.3 From 9e2103f36110b50fc30be333fe2b43522c2dfa2a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 15 Jan 2026 09:25:50 +0100 Subject: net: Add lease info to queue-get response Populate nested lease info to the queue-get response that returns the ifindex, queue id with type and optionally netns id if the device resides in a different netns. Example with ynl client: # ip a [...] 4: enp10s0f0np0: mtu 1500 xdp/id:24 qdisc mq state UP group default qlen 1000 link/ether e8:eb:d3:a3:43:f6 brd ff:ff:ff:ff:ff:ff inet 10.0.0.2/24 scope global enp10s0f0np0 valid_lft forever preferred_lft forever inet6 fe80::eaeb:d3ff:fea3:43f6/64 scope link proto kernel_ll valid_lft forever preferred_lft forever [...] # ethtool -i enp10s0f0np0 driver: mlx5_core [...] # ./pyynl/cli.py \ --spec ~/netlink/specs/netdev.yaml \ --do queue-get \ --json '{"ifindex": 4, "id": 15, "type": "rx"}' {'id': 15, 'ifindex': 4, 'lease': {'ifindex': 8, 'netns-id': 0, 'queue': {'id': 1, 'type': 'rx'}}, 'napi-id': 8227, 'type': 'rx', 'xsk': {}} # ip netns list foo (id: 0) # ip netns exec foo ip a [...] 8: nk@NONE: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff inet6 fe80::200:ff:fe00:0/64 scope link proto kernel_ll valid_lft forever preferred_lft forever [...] # ip netns exec foo ethtool -i nk driver: netkit [...] # ip netns exec foo ls /sys/class/net/nk/queues/ rx-0 rx-1 tx-0 # ip netns exec foo ./pyynl/cli.py \ --spec ~/netlink/specs/netdev.yaml \ --do queue-get \ --json '{"ifindex": 8, "id": 1, "type": "rx"}' {'id': 1, 'ifindex': 8, 'type': 'rx'} Note that the caller of netdev_nl_queue_fill_one() holds the netdevice lock. For the queue-get we do not lock both devices. When queues get {un,}leased, both devices are locked, thus if __netif_get_rx_queue_peer() returns true, the peer pointer points to a valid device. The netns-id is fetched via peernet2id_alloc() similarly as done in OVS. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Nikolay Aleksandrov Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260115082603.219152-4-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/net/netdev_rx_queue.h | 10 ++++++++++ net/core/netdev-genl.c | 36 ++++++++++++++++++++++++++++++++++ net/core/netdev_rx_queue.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) (limited to 'include/net') diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 1cacc2451516..de04fdfdad72 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -63,4 +63,14 @@ void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, struct netdev_rx_queue *rxq_src); void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, struct netdev_rx_queue *rxq_src); +bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq); + +enum netif_lease_dir { + NETIF_VIRT_TO_PHYS, + NETIF_PHYS_TO_VIRT, +}; + +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, + enum netif_lease_dir dir); #endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index cd4dc4eef029..51c830f88f10 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -391,8 +391,11 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { struct pp_memory_provider_params *params; + struct net_device *orig_netdev = netdev; + struct nlattr *nest_lease, *nest_queue; struct netdev_rx_queue *rxq; struct netdev_queue *txq; + u32 lease_q_idx = q_idx; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -410,6 +413,37 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; + if (netif_rx_queue_lease_get_owner(&netdev, &lease_q_idx)) { + struct net *net, *peer_net; + + nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); + if (!nest_lease) + goto nla_put_failure; + nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); + if (!nest_queue) + goto nla_put_failure; + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, lease_q_idx)) + goto nla_put_failure; + if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) + goto nla_put_failure; + nla_nest_end(rsp, nest_queue); + if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, + READ_ONCE(netdev->ifindex))) + goto nla_put_failure; + rcu_read_lock(); + peer_net = dev_net_rcu(netdev); + net = dev_net_rcu(orig_netdev); + if (!net_eq(net, peer_net)) { + s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); + + if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) + goto nla_put_failure_unlock; + } + rcu_read_unlock(); + nla_nest_end(rsp, nest_lease); + netdev = orig_netdev; + } + params = &rxq->mp_params; if (params->mp_ops && params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) @@ -437,6 +471,8 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 830c1a964c36..61fe25817e98 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -40,6 +40,51 @@ bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) return false; } +static bool netif_lease_dir_ok(const struct net_device *dev, + enum netif_lease_dir dir) +{ + if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) + return true; + if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) + return true; + return false; +} + +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, + enum netif_lease_dir dir) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); + + if (rxq->lease) { + if (!netif_lease_dir_ok(orig_dev, dir)) + return NULL; + rxq = rxq->lease; + *rxq_idx = get_netdev_rx_queue_index(rxq); + *dev = rxq->dev; + } + return rxq; +} + +bool netif_rx_queue_lease_get_owner(struct net_device **dev, + unsigned int *rxq_idx) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq; + + /* The physical device needs to be locked. If there is indeed a lease, + * then the virtual device holds a reference on the physical device + * and the lease stays active until the virtual device is torn down. + * When queues get {un,}leased both devices are always locked. + */ + netdev_ops_assert_locked(orig_dev); + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_PHYS_TO_VIRT); + if (rxq && orig_dev != *dev) + return true; + return false; +} + /* See also page_pool_is_unreadable() */ bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) { -- cgit v1.2.3 From 0caa9a8ddec3bf87bffb0eb99635068ddddce35d Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 15 Jan 2026 09:25:52 +0100 Subject: net: Proxy net_mp_{open,close}_rxq for leased queues When a process in a container wants to setup a memory provider, it will use the virtual netdev and a leased rxq, and call net_mp_{open,close}_rxq to try and restart the queue. At this point, proxy the queue restart on the real rxq in the physical netdev. For memory providers (io_uring zero-copy rx and devmem), it causes the real rxq in the physical netdev to be filled from a memory provider that has DMA mapped memory from a process within a container. Signed-off-by: David Wei Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Reviewed-by: Nikolay Aleksandrov Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260115082603.219152-6-daniel@iogearbox.net Signed-off-by: Paolo Abeni --- include/net/netdev_rx_queue.h | 4 ++ include/net/page_pool/memory_provider.h | 4 +- net/core/netdev_rx_queue.c | 78 +++++++++++++++++++++++++-------- 3 files changed, 66 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index de04fdfdad72..508d11afaecb 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -73,4 +73,8 @@ enum netif_lease_dir { struct netdev_rx_queue * __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, enum netif_lease_dir dir); +struct netdev_rx_queue * +netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); +void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, + struct net_device *dev); #endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h index ada4f968960a..b6f811c3416b 100644 --- a/include/net/page_pool/memory_provider.h +++ b/include/net/page_pool/memory_provider.h @@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); void net_mp_niov_clear_page_pool(struct net_iov *niov); -int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *p); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack); -void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *old_p); void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 61fe25817e98..75c7a68cb90d 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -67,6 +67,29 @@ __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, return rxq; } +struct netdev_rx_queue * +netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) +{ + struct net_device *orig_dev = *dev; + struct netdev_rx_queue *rxq; + + /* Locking order is always from the virtual to the physical device + * see netdev_nl_queue_create_doit(). + */ + netdev_ops_assert_locked(orig_dev); + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); + if (rxq && orig_dev != *dev) + netdev_lock(*dev); + return rxq; +} + +void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, + struct net_device *dev) +{ + if (orig_dev != dev) + netdev_unlock(dev); +} + bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq_idx) { @@ -183,49 +206,63 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { + struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int ret; if (!netdev_need_ops_lock(dev)) return -EOPNOTSUPP; - if (rxq_idx >= dev->real_num_rx_queues) { NL_SET_ERR_MSG(extack, "rx queue index out of range"); return -ERANGE; } - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); + if (!rxq) { + NL_SET_ERR_MSG(extack, "rx queue peered to a virtual netdev"); + return -EBUSY; + } + if (!dev->dev.parent) { + NL_SET_ERR_MSG(extack, "rx queue is mapped to a virtual netdev"); + ret = -EBUSY; + goto out; + } if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (dev_xdp_prog_count(dev)) { NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); - return -EEXIST; + ret = -EEXIST; + goto out; } - - rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - return -EEXIST; + ret = -EEXIST; + goto out; } #ifdef CONFIG_XDP_SOCKETS if (rxq->pool) { NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - return -EBUSY; + ret = -EBUSY; + goto out; } #endif - rxq->mp_params = *p; ret = netdev_rx_queue_restart(dev, rxq_idx); if (ret) { rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; } +out: + netif_put_rx_queue_lease_locked(orig_dev, dev); return ret; } @@ -240,38 +277,43 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, return ret; } -void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, +void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *old_p) { + struct net_device *orig_dev = dev; struct netdev_rx_queue *rxq; int err; - if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues)) return; - rxq = __netif_get_rx_queue(dev, ifq_idx); + rxq = netif_get_rx_queue_lease_locked(&dev, &rxq_idx); + if (WARN_ON_ONCE(!rxq)) + return; /* Callers holding a netdev ref may get here after we already * went thru shutdown via dev_memory_provider_uninstall(). */ if (dev->reg_state > NETREG_REGISTERED && !rxq->mp_params.mp_ops) - return; + goto out; if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || rxq->mp_params.mp_priv != old_p->mp_priv)) - return; + goto out; rxq->mp_params.mp_ops = NULL; rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, ifq_idx); + err = netdev_rx_queue_restart(dev, rxq_idx); WARN_ON(err && err != -ENETDOWN); +out: + netif_put_rx_queue_lease_locked(orig_dev, dev); } -void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, +void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, struct pp_memory_provider_params *old_p) { netdev_lock(dev); - __net_mp_close_rxq(dev, ifq_idx, old_p); + __net_mp_close_rxq(dev, rxq_idx, old_p); netdev_unlock(dev); } -- cgit v1.2.3