From 9063de636cee235bd736ab3e4895e2826e606dea Mon Sep 17 00:00:00 2001
From: Michal Luczaj
Date: Fri, 25 Jul 2025 12:33:04 +0200
Subject: kcm: Fix splice support

Flags passed in for the splice() syscall should not end up in
skb_recv_datagram(). As SPLICE_F_NONBLOCK == MSG_PEEK, the kernel gets
confused: the skb isn't unlinked from a receive queue, while
strp_msg::offset and strp_msg::full_len are updated.

Unbreak the logic a bit more by mapping both O_NONBLOCK and
SPLICE_F_NONBLOCK to MSG_DONTWAIT. This way we align with man splice(2)
in regard to errno EAGAIN:

  SPLICE_F_NONBLOCK was specified in flags or one of the file
  descriptors had been marked as nonblocking (O_NONBLOCK), and the
  operation would block.

Fixes: 5121197ecc5d ("kcm: close race conditions on sk_receive_queue")
Fixes: 91687355b927 ("kcm: Splice support")
Signed-off-by: Michal Luczaj
Link: https://patch.msgid.link/20250725-kcm-splice-v1-1-9a725ad2ee71@rbox.co
Signed-off-by: Jakub Kicinski
---
 net/kcm/kcmsock.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index a0be3896a934..a4971e6fa943 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1029,6 +1030,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
 	ssize_t copied;
 	struct sk_buff *skb;

+	if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK)
+		flags = MSG_DONTWAIT;
+	else
+		flags = 0;
+
 	/* Only support splice for SOCKSEQPACKET */

 	skb = skb_recv_datagram(sk, flags, &err);
--
cgit v1.2.3

From 2da4def0f487f24bbb0cece3bb2bcdcb918a0b72 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Fri, 25 Jul 2025 18:08:46 -0700
Subject: netpoll: prevent hanging NAPI when netcons gets enabled

Paolo spotted hangs in NIPA running driver tests against virtio.
The tests hang in virtnet_close() -> virtnet_napi_tx_disable().

The problem is only reproducible if running multiple of our tests
in sequence (I used TEST_PROGS="xdp.py ping.py netcons_basic.sh \
netpoll_basic.py stats.py").

Initial suspicion was that this is a simple case of double-disable
of NAPI, but instrumenting the code reveals:

  Deadlocked on NAPI ffff888007cd82c0 (virtnet_poll_tx): state: 0x37, disabled: false, owner: 0, listed: false, weight: 64

The NAPI was not in fact disabled, owner is 0 (rather than -1), so
the NAPI "thinks" it's scheduled for CPU 0 but it's not listed
(!list_empty(&n->poll_list) => false).

It seems odd that normal NAPI processing would wedge itself like this.
A better suspicion is that netpoll gets enabled while NAPI is polling,
and also grabs the NAPI instance. This confuses napi_complete_done():

  [netpoll]                              [normal NAPI]
                                         napi_poll()
                                           have = netpoll_poll_lock()
                                             rcu_access_pointer(dev->npinfo)
                                               return NULL # no netpoll
                                           __napi_poll()
                                             ->poll(->weight)
  poll_napi()
    cmpxchg(->poll_owner, -1, cpu)
    poll_one_napi()
      set_bit(NAPI_STATE_NPSVC, ->state)
                                             napi_complete_done()
                                               if (NAPIF_STATE_NPSVC)
                                                 return false
                                         # exit without clearing SCHED

This feels very unlikely, but perhaps virtio has some interactions
with the hypervisor in the NAPI ->poll that makes the race window
larger?

The best I could do to prove the theory was to add and trigger this
warning in napi_poll (just before netpoll_poll_unlock()):

  WARN_ONCE(!have && rcu_access_pointer(n->dev->npinfo) &&
            napi_is_scheduled(n) && list_empty(&n->poll_list),
            "NAPI race with netpoll %px", n);

If this warning hits, the next virtio_close() will hang.
This patch survived 30 test iterations without a hang (without it the longest clean run was around 10). Credit for triggering this goes to Breno's recent netconsole tests. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Paolo Abeni Link: https://lore.kernel.org/c5a93ed1-9abe-4880-a3bb-8d1678018b1d@redhat.com Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20250726010846.1105875-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a1da97b5b30b..5f65b62346d4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -768,6 +768,13 @@ int netpoll_setup(struct netpoll *np) if (err) goto flush; rtnl_unlock(); + + /* Make sure all NAPI polls which started before dev->npinfo + * was visible have exited before we start calling NAPI poll. + * NAPI skips locking if dev->npinfo is NULL. + */ + synchronize_rcu(); + return 0; flush: -- cgit v1.2.3 From 759dfc7d04bab1b0b86113f1164dc1fec192b859 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 28 Jul 2025 11:06:47 +0300 Subject: netlink: avoid infinite retry looping in netlink_unicast() netlink_attachskb() checks for the socket's read memory allocation constraints. Firstly, it has: rmem < READ_ONCE(sk->sk_rcvbuf) to check if the just increased rmem value fits into the socket's receive buffer. If not, it proceeds and tries to wait for the memory under: rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf) The checks don't cover the case when skb->truesize + sk->sk_rmem_alloc is equal to sk->sk_rcvbuf. Thus the function neither successfully accepts these conditions, nor manages to reschedule the task - and is called in retry loop for indefinite time which is caught as: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 0-....: (25999 ticks this GP) idle=ef2/1/0x4000000000000000 softirq=262269/262269 fqs=6212 (t=26000 jiffies g=230833 q=259957) NMI backtrace for cpu 0 CPU: 0 PID: 22 Comm: kauditd Not tainted 5.10.240 #68 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-4.fc42 04/01/2014 Call Trace: dump_stack lib/dump_stack.c:120 nmi_cpu_backtrace.cold lib/nmi_backtrace.c:105 nmi_trigger_cpumask_backtrace lib/nmi_backtrace.c:62 rcu_dump_cpu_stacks kernel/rcu/tree_stall.h:335 rcu_sched_clock_irq.cold kernel/rcu/tree.c:2590 update_process_times kernel/time/timer.c:1953 tick_sched_handle kernel/time/tick-sched.c:227 tick_sched_timer kernel/time/tick-sched.c:1399 __hrtimer_run_queues kernel/time/hrtimer.c:1652 hrtimer_interrupt kernel/time/hrtimer.c:1717 __sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1113 asm_call_irq_on_stack arch/x86/entry/entry_64.S:808 netlink_attachskb net/netlink/af_netlink.c:1234 netlink_unicast net/netlink/af_netlink.c:1349 kauditd_send_queue kernel/audit.c:776 kauditd_thread kernel/audit.c:897 kthread kernel/kthread.c:328 ret_from_fork arch/x86/entry/entry_64.S:304 Restore the original behavior of the check which commit in Fixes accidentally missed when restructuring the code. Found by Linux Verification Center (linuxtesting.org). 
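To make the uncovered boundary case concrete, here is a small standalone sketch (illustrative only, with made-up numbers; not the kernel function itself) of the two checks described above and the rmem + skb->truesize == sk->sk_rcvbuf case that falls through both of them:

	/* gcc -o attachskb_demo attachskb_demo.c && ./attachskb_demo */
	#include <stdio.h>

	enum action { ACCEPT, WAIT, RETRY };

	/* rmem_old: sk_rmem_alloc before the skb is charged to the socket */
	static enum action old_checks(unsigned int rmem_old, unsigned int truesize,
				      unsigned int rcvbuf)
	{
		unsigned int rmem = rmem_old + truesize;	/* atomic_add_return() result */

		if (rmem == truesize || rmem < rcvbuf)	/* accept path: strict '<' */
			return ACCEPT;
		if (rmem_old + truesize > rcvbuf)	/* sleep path: strict '>' */
			return WAIT;
		return RETRY;	/* equality: neither accepted nor slept, caller loops */
	}

	int main(void)
	{
		/* skb exactly fills the receive buffer: 0 = ACCEPT, 1 = WAIT, 2 = RETRY */
		printf("%d\n", old_checks(1000, 24, 1024));	/* prints 2 before the fix */
		return 0;
	}

Changing the accept condition to rmem <= READ_ONCE(sk->sk_rcvbuf), as done in the diff below, makes the equality case take the accept path instead of spinning.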
Fixes: ae8f160e7eb2 ("netlink: Fix wraparounds of sk->sk_rmem_alloc.") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250728080727.255138-1-pchelkin@ispras.ru Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5949855fa29e..e2f7080dd5d7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1218,7 +1218,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); - if ((rmem == skb->truesize || rmem < READ_ONCE(sk->sk_rcvbuf)) && + if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); return 0; -- cgit v1.2.3 From d45cf1e7d7180256e17c9ce88e32e8061a7887fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Jul 2025 13:17:38 +0000 Subject: ipv6: reject malicious packets in ipv6_gso_segment() syzbot was able to craft a packet with very long IPv6 extension headers leading to an overflow of skb->transport_header. This 16bit field has a limited range. Add skb_reset_transport_header_careful() helper and use it from ipv6_gso_segment() WARNING: CPU: 0 PID: 5871 at ./include/linux/skbuff.h:3032 skb_reset_transport_header include/linux/skbuff.h:3032 [inline] WARNING: CPU: 0 PID: 5871 at ./include/linux/skbuff.h:3032 ipv6_gso_segment+0x15e2/0x21e0 net/ipv6/ip6_offload.c:151 Modules linked in: CPU: 0 UID: 0 PID: 5871 Comm: syz-executor211 Not tainted 6.16.0-rc6-syzkaller-g7abc678e3084 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:skb_reset_transport_header include/linux/skbuff.h:3032 [inline] RIP: 0010:ipv6_gso_segment+0x15e2/0x21e0 net/ipv6/ip6_offload.c:151 Call Trace: skb_mac_gso_segment+0x31c/0x640 net/core/gso.c:53 nsh_gso_segment+0x54a/0xe10 net/nsh/nsh.c:110 skb_mac_gso_segment+0x31c/0x640 net/core/gso.c:53 __skb_gso_segment+0x342/0x510 net/core/gso.c:124 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x857/0x11b0 net/core/dev.c:3950 validate_xmit_skb_list+0x84/0x120 net/core/dev.c:4000 sch_direct_xmit+0xd3/0x4b0 net/sched/sch_generic.c:329 __dev_xmit_skb net/core/dev.c:4102 [inline] __dev_queue_xmit+0x17b6/0x3a70 net/core/dev.c:4679 Fixes: d1da932ed4ec ("ipv6: Separate ipv6 offload support") Reported-by: syzbot+af43e647fd835acc02df@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/688a1a05.050a0220.5d226.0008.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Dawid Osuchowski Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250730131738.3385939-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 23 +++++++++++++++++++++++ net/ipv6/ip6_offload.c | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b8b06e71b73e..14b923ddb6df 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3033,6 +3033,29 @@ static inline void skb_reset_transport_header(struct sk_buff *skb) skb->transport_header = offset; } +/** + * skb_reset_transport_header_careful - conditionally reset transport header + * @skb: buffer to alter + * + * Hardened version of skb_reset_transport_header(). + * + * Returns: true if the operation was a success. 
+ */ +static inline bool __must_check +skb_reset_transport_header_careful(struct sk_buff *skb) +{ + long offset = skb->data - skb->head; + + if (unlikely(offset != (typeof(skb->transport_header))offset)) + return false; + + if (unlikely(offset == (typeof(skb->transport_header))~0U)) + return false; + + skb->transport_header = offset; + return true; +} + static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9822163428b0..fce91183797a 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -148,7 +148,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { - skb_reset_transport_header(skb); + if (!skb_reset_transport_header_careful(skb)) + goto out; + segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; -- cgit v1.2.3 From ae8508b25def57982493c48694ef135973bfabe0 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Tue, 29 Jul 2025 02:31:49 +0900 Subject: net/sched: taprio: enforce minimum value for picos_per_byte Syzbot reported a WARNING in taprio_get_start_time(). When link speed is 470,589 or greater, q->picos_per_byte becomes too small, causing length_to_duration(q, ETH_ZLEN) to return zero. This zero value leads to validation failures in fill_sched_entry() and parse_taprio_schedule(), allowing arbitrary values to be assigned to entry->interval and cycle_time. As a result, sched->cycle can become zero. Since SPEED_800000 is the largest defined speed in include/uapi/linux/ethtool.h, this issue can occur in realistic scenarios. To ensure length_to_duration() returns a non-zero value for minimum-sized Ethernet frames (ETH_ZLEN = 60), picos_per_byte must be at least 17 (60 * 17 > PSEC_PER_NSEC which is 1000). This patch enforces a minimum value of 17 for picos_per_byte when the calculated value would be lower, and adds a warning message to inform users that scheduling accuracy may be affected at very high link speeds. Fixes: fb66df20a720 ("net/sched: taprio: extend minimum interval restriction to entire cycle too") Reported-by: syzbot+398e1ee4ca2cac05fddb@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=398e1ee4ca2cac05fddb Signed-off-by: Takamitsu Iwai Link: https://patch.msgid.link/20250728173149.45585-1-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e759e43ad27e..39b735386996 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -43,6 +43,11 @@ static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX +/* Minimum value for picos_per_byte to ensure non-zero duration + * for minimum-sized Ethernet frames (ETH_ZLEN = 60). 
+ * 60 * 17 > PSEC_PER_NSEC (1000)
+ */
+#define TAPRIO_PICOS_PER_BYTE_MIN 17

 struct sched_entry {
	/* Durations between this GCL entry and the GCL entry where the
@@ -1284,7 +1289,8 @@ static void taprio_start_sched(struct Qdisc *sch,
 }

 static void taprio_set_picos_per_byte(struct net_device *dev,
-				      struct taprio_sched *q)
+				      struct taprio_sched *q,
+				      struct netlink_ext_ack *extack)
 {
 	struct ethtool_link_ksettings ecmd;
 	int speed = SPEED_10;
@@ -1300,6 +1306,15 @@ static void taprio_set_picos_per_byte(struct net_device *dev,
 skip:
 	picos_per_byte = (USEC_PER_SEC * 8) / speed;

+	if (picos_per_byte < TAPRIO_PICOS_PER_BYTE_MIN) {
+		if (!extack)
+			pr_warn("Link speed %d is too high. Schedule may be inaccurate.\n",
+				speed);
+		NL_SET_ERR_MSG_FMT_MOD(extack,
+				       "Link speed %d is too high. Schedule may be inaccurate.",
+				       speed);
+		picos_per_byte = TAPRIO_PICOS_PER_BYTE_MIN;
+	}
 	atomic64_set(&q->picos_per_byte, picos_per_byte);
 	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
@@ -1324,7 +1339,7 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
 		if (dev != qdisc_dev(q->root))
 			continue;

-		taprio_set_picos_per_byte(dev, q);
+		taprio_set_picos_per_byte(dev, q, NULL);

 		stab = rtnl_dereference(q->root->stab);
@@ -1844,7 +1859,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	q->flags = taprio_flags;

 	/* Needed for length_to_duration() during netlink attribute parsing */
-	taprio_set_picos_per_byte(dev, q);
+	taprio_set_picos_per_byte(dev, q, extack);

 	err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
 	if (err < 0)
--
cgit v1.2.3

From 1dbf1d590d10a6d1978e8184f8dfe20af22d680a Mon Sep 17 00:00:00 2001
From: Sharath Chandra Vurukala
Date: Wed, 30 Jul 2025 16:21:18 +0530
Subject: net: Add locking to protect skb->dev access in ip_output

In ip_output(), skb->dev is updated from skb_dst(skb)->dev; this can
become invalid when the interface is unregistered and freed. Introduce
a new skb_dst_dev_rcu() function to be used instead of skb_dst_dev()
under RCU locks in ip_output(). This will ensure that all the skbs
associated with the dev being deregistered are transmitted out first,
before the dev is freed.

Given that ip_output() is called within an rcu_read_lock() critical
section or from a bottom-half context, it is safe to introduce an RCU
read-side critical section within it.

Multiple panic call stacks were observed when UL traffic was run
concurrently with device deregistration from different functions;
one sample is pasted below for reference.

[496733.627565][T13385] Call trace:
[496733.627570][T13385] bpf_prog_ce7c9180c3b128ea_cgroupskb_egres+0x24c/0x7f0
[496733.627581][T13385] __cgroup_bpf_run_filter_skb+0x128/0x498
[496733.627595][T13385] ip_finish_output+0xa4/0xf4
[496733.627605][T13385] ip_output+0x100/0x1a0
[496733.627613][T13385] ip_send_skb+0x68/0x100
[496733.627618][T13385] udp_send_skb+0x1c4/0x384
[496733.627625][T13385] udp_sendmsg+0x7b0/0x898
[496733.627631][T13385] inet_sendmsg+0x5c/0x7c
[496733.627639][T13385] __sys_sendto+0x174/0x1e4
[496733.627647][T13385] __arm64_sys_sendto+0x28/0x3c
[496733.627653][T13385] invoke_syscall+0x58/0x11c
[496733.627662][T13385] el0_svc_common+0x88/0xf4
[496733.627669][T13385] do_el0_svc+0x2c/0xb0
[496733.627676][T13385] el0_svc+0x2c/0xa4
[496733.627683][T13385] el0t_64_sync_handler+0x68/0xb4
[496733.627689][T13385] el0t_64_sync+0x1a4/0x1a8

Changes in v3:
- Replaced WARN_ON() with WARN_ON_ONCE(), as suggested by Willem de Bruijn.
- Dropped legacy lines mistakenly pulled in from an outdated branch. Changes in v2: - Addressed review comments from Eric Dumazet - Used READ_ONCE() to prevent potential load/store tearing - Added skb_dst_dev_rcu() and used along with rcu_read_lock() in ip_output Signed-off-by: Sharath Chandra Vurukala Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250730105118.GA26100@hu-sharathv-hyd.qualcomm.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 12 ++++++++++++ net/ipv4/ip_output.c | 15 ++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index 00467c1b5093..bab01363bb97 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -568,11 +568,23 @@ static inline struct net_device *dst_dev(const struct dst_entry *dst) return READ_ONCE(dst->dev); } +static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) +{ + /* In the future, use rcu_dereference(dst->dev) */ + WARN_ON_ONCE(!rcu_read_lock_held()); + return READ_ONCE(dst->dev); +} + static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) { return dst_dev(skb_dst(skb)); } +static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) +{ + return dst_dev_rcu(skb_dst(skb)); +} + static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) { return dev_net(skb_dst_dev(skb)); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 10a1d182fd84..84e7f8a2f50f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -425,15 +425,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst_dev(skb), *indev = skb->dev; + struct net_device *dev, *indev = skb->dev; + int ret_val; + rcu_read_lock(); + dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip_finish_output, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); + rcu_read_unlock(); + return ret_val; } EXPORT_SYMBOL(ip_output); -- cgit v1.2.3 From fa516c0d8bf90da9d5b168757162205aafe5d0e1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 31 Jul 2025 18:13:35 -0700 Subject: net: devmem: fix DMA direction on unmapping Looks like we always unmap the DMA_BUF with DMA_FROM_DEVICE direction. While at it unexport __net_devmem_dmabuf_binding_free(), it's internal. Found by code inspection. 
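For context, the direction passed to dma_buf_unmap_attachment_unlocked() should match the one used when the attachment was mapped: a Tx binding is mapped DMA_TO_DEVICE, and unmapping it as DMA_FROM_DEVICE can ask the DMA API for the wrong kind of CPU/device synchronization on teardown (e.g. a needless bounce-buffer copy-back). A minimal sketch of the pattern the fix adopts, remembering the direction at bind time (simplified; the struct and function names other than the dma-buf API call are illustrative):

	struct dmabuf_binding {
		struct dma_buf_attachment *attachment;
		struct sg_table *sgt;
		enum dma_data_direction direction;	/* DMA_FROM_DEVICE (Rx) or DMA_TO_DEVICE (Tx) */
	};

	static void dmabuf_binding_unmap(struct dmabuf_binding *b)
	{
		/* Unmap with the same direction the mapping was created with,
		 * rather than hard-coding DMA_FROM_DEVICE.
		 */
		dma_buf_unmap_attachment_unlocked(b->attachment, b->sgt, b->direction);
	}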
Fixes: bd61848900bf ("net: devmem: Implement TX path") Acked-by: Stanislav Fomichev Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250801011335.2267515-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 6 +++--- net/core/devmem.h | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/devmem.c b/net/core/devmem.c index b3a62ca0df65..24c591ab38ae 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -70,14 +70,13 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq) gen_pool_destroy(binding->chunk_pool); dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, - DMA_FROM_DEVICE); + binding->direction); dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); kvfree(binding->tx_vec); kfree(binding); } -EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -208,6 +207,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, mutex_init(&binding->lock); binding->dmabuf = dmabuf; + binding->direction = direction; binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); if (IS_ERR(binding->attachment)) { @@ -312,7 +312,7 @@ err_tx_vec: kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, - DMA_FROM_DEVICE); + direction); err_detach: dma_buf_detach(dmabuf, binding->attachment); err_free_binding: diff --git a/net/core/devmem.h b/net/core/devmem.h index 0a3b28ba5c13..41cd6e1c9141 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -56,6 +56,9 @@ struct net_devmem_dmabuf_binding { */ u32 id; + /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */ + enum dma_data_direction direction; + /* Array of net_iov pointers for this binding, sorted by virtual * address. This array is convenient to map the virtual addresses to * net_iovs in the TX path. @@ -165,10 +168,6 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) { } -static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) -{ -} - static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, enum dma_data_direction direction, -- cgit v1.2.3 From 01d3c8417b9c1b884a8a981a3b886da556512f36 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Fri, 1 Aug 2025 13:54:16 -0400 Subject: net/packet: fix a race in packet_set_ring() and packet_notifier() When packet_set_ring() releases po->bind_lock, another thread can run packet_notifier() and process an NETDEV_UP event. This race and the fix are both similar to that of commit 15fe076edea7 ("net/packet: fix a race in packet_bind() and packet_notifier()"). There too the packet_notifier NETDEV_UP event managed to run while a po->bind_lock critical section had to be temporarily released. And the fix was similarly to temporarily set po->num to zero to keep the socket unhooked until the lock is retaken. The po->bind_lock in packet_set_ring and packet_notifier precede the introduction of git history. 
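Spelled out, the window the fix closes looks roughly like this (simplified sketch, not verbatim kernel code; the notifier re-arms the protocol hook whenever po->num is non-zero):

  [packet_set_ring]                      [packet_notifier, NETDEV_UP]
  spin_lock(&po->bind_lock);
  num = po->num;
  __unregister_prot_hook(sk, false);
  spin_unlock(&po->bind_lock);
                                         spin_lock(&po->bind_lock);
                                         if (po->num)   /* still non-zero */
                                                 register_prot_hook(sk);
                                         spin_unlock(&po->bind_lock);
  /* ring vectors swapped while the
   * socket is unexpectedly hooked */

Writing po->num = 0 before dropping bind_lock, and restoring it under the lock afterwards, keeps the notifier from re-hooking the socket in that window, matching the approach of commit 15fe076edea7.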
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Quang Le Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20250801175423.2970334-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index bc438d0d96a7..a7017d7f0927 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4573,10 +4573,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; - if (was_running) { - WRITE_ONCE(po->num, 0); + WRITE_ONCE(po->num, 0); + if (was_running) __unregister_prot_hook(sk, false); - } + spin_unlock(&po->bind_lock); synchronize_net(); @@ -4608,10 +4608,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); - if (was_running) { - WRITE_ONCE(po->num, num); + WRITE_ONCE(po->num, num); + if (was_running) register_prot_hook(sk); - } + spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ -- cgit v1.2.3 From ffd2dc4c6c49ff4f1e5d34e454a6a55608104c17 Mon Sep 17 00:00:00 2001 From: Maher Azzouzi Date: Fri, 1 Aug 2025 17:18:57 -0700 Subject: net/sched: mqprio: fix stack out-of-bounds write in tc entry parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TCA_MQPRIO_TC_ENTRY_INDEX is validated using NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), which allows the value TC_QOPT_MAX_QUEUE (16). This leads to a 4-byte out-of-bounds stack write in the fp[] array, which only has room for 16 elements (0–15). Fix this by changing the policy to allow only up to TC_QOPT_MAX_QUEUE - 1. Fixes: f62af20bed2d ("net/sched: mqprio: allow per-TC user input of FP adminStatus") Reviewed-by: Eric Dumazet Signed-off-by: Maher Azzouzi Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250802001857.2702497-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/sched/sch_mqprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 51d4013b6121..f3e5ef9a9592 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -152,7 +152,7 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), -- cgit v1.2.3 From e6d76268813dc64cc0b74ea9c274501f2de05344 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Mon, 4 Aug 2025 16:44:57 +0000 Subject: net: Update threaded state in napi config in netif_set_threaded Commit 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") added support to enable/disable threaded napi using netlink. This also extended the napi config save/restore functionality to set the napi threaded state. This breaks netdev reset for drivers that use napi threaded at device level and also use napi config save/restore on napi_disable/napi_enable. 
Basically on netdev with napi threaded enabled at device level, a napi_enable call will get stuck trying to stop the napi kthread. This is because the napi->config->threaded is set to disabled when threaded is enabled at device level. The issue can be reproduced on virtio-net device using qemu. To reproduce the issue run following, echo 1 > /sys/class/net/threaded ethtool -L eth0 combined 1 Update the threaded state in napi config in netif_set_threaded and add a new test that verifies this scenario. Tested on qemu with virtio-net: NETIF=eth0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250804164457.2494390-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 26 ++--- tools/testing/selftests/drivers/net/Makefile | 1 + .../testing/selftests/drivers/net/napi_threaded.py | 111 +++++++++++++++++++++ 3 files changed, 121 insertions(+), 17 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/napi_threaded.py (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b28ce68830b2..68dc47d7e700 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6978,6 +6978,12 @@ int napi_set_threaded(struct napi_struct *napi, if (napi->config) napi->config->threaded = threaded; + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ if (!threaded && napi->thread) { napi_stop_kthread(napi); } else { @@ -7011,23 +7017,9 @@ int netif_set_threaded(struct net_device *dev, WRITE_ONCE(dev->threaded, threaded); - /* Make sure kthread is created before THREADED bit - * is set. - */ - smp_mb__before_atomic(); - - /* Setting/unsetting threaded mode on a napi might not immediately - * take effect, if the current napi instance is actively being - * polled. In this case, the switch between threaded mode and - * softirq mode will happen in the next round of napi_schedule(). - * This should not cause hiccups/stalls to the live traffic. - */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (!threaded && napi->thread) - napi_stop_kthread(napi); - else - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); - } + /* The error should not occur as the kthreads are already created. 
*/ + list_for_each_entry(napi, &dev->napi_list, dev_list) + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); return err; } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 3556f3563e08..984ece05f7f9 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -11,6 +11,7 @@ TEST_GEN_FILES := \ TEST_PROGS := \ napi_id.py \ + napi_threaded.py \ netcons_basic.sh \ netcons_cmdline.sh \ netcons_fragmented_msg.sh \ diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py new file mode 100755 index 000000000000..b2698db39817 --- /dev/null +++ b/tools/testing/selftests/drivers/net/napi_threaded.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Test napi threaded states. +""" + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_ne, ksft_ge +from lib.py import NetDrvEnv, NetdevFamily +from lib.py import cmd, defer, ethtool + + +def _assert_napi_threaded_enabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'enabled') + ksft_ne(napi.get('pid'), None) + + +def _assert_napi_threaded_disabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'disabled') + ksft_eq(napi.get('pid'), None) + + +def _set_threaded_state(cfg, threaded) -> None: + cmd(f"echo {threaded} > /sys/class/net/{cfg.ifname}/threaded") + + +def _setup_deferred_cleanup(cfg) -> None: + combined = ethtool(f"-l {cfg.ifname}", json=True)[0].get("combined", 0) + ksft_ge(combined, 2) + defer(ethtool, f"-L {cfg.ifname} combined {combined}") + + threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout + defer(_set_threaded_state, cfg, threaded) + + +def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level and + then disabled at napi level for one napi, the threaded state + of all napis is preserved after a change in number of queues. + """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + # disable threaded for napi1 + nl.napi_set({'id': napi1_id, 'threaded': 'disabled'}) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_disabled(nl, napi1_id) + + +def change_num_queues(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level, + the napi threaded state is preserved after a change in + number of queues. 
+ """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEnv(__file__, queue_count=2) as cfg: + ksft_run([change_num_queues, + enable_dev_threaded_disable_napi_threaded], + args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3