From 48693d119b2114f8eaf8b8f972b29e05ae581ad4 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 13 Jul 2025 00:30:06 +0100 Subject: SUNRPC: Remove unused xdr functions Remove a bunch of unused xdr_*decode* functions: The last use of xdr_decode_netobj() was removed in 2021 by: commit 7cf96b6d0104 ("lockd: Update the NLMv4 SHARE arguments decoder to use struct xdr_stream") The last use of xdr_decode_string_inplace() was removed in 2021 by: commit 3049e974a7c7 ("lockd: Update the NLMv4 FREE_ALL arguments decoder to use struct xdr_stream") The last use of xdr_stream_decode_opaque() was removed in 2024 by: commit fed8a17c61ff ("xdrgen: typedefs should use the built-in string and opaque functions") The functions xdr_stream_decode_string() and xdr_stream_decode_opaque_dup() were both added in 2018 by the commit 0e779aa70308 ("SUNRPC: Add helpers for decoding opaque and string types") but never used. Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20250712233006.403226-1-linux@treblig.org Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 110 ------------------------------------------------------- 1 file changed, 110 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 2ea00e354ba6..a0aae1144212 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -37,19 +37,6 @@ xdr_encode_netobj(__be32 *p, const struct xdr_netobj *obj) } EXPORT_SYMBOL_GPL(xdr_encode_netobj); -__be32 * -xdr_decode_netobj(__be32 *p, struct xdr_netobj *obj) -{ - unsigned int len; - - if ((len = be32_to_cpu(*p++)) > XDR_MAX_NETOBJ) - return NULL; - obj->len = len; - obj->data = (u8 *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_netobj); - /** * xdr_encode_opaque_fixed - Encode fixed length opaque data * @p: pointer to current position in XDR buffer. 
@@ -102,21 +89,6 @@ xdr_encode_string(__be32 *p, const char *string) } EXPORT_SYMBOL_GPL(xdr_encode_string); -__be32 * -xdr_decode_string_inplace(__be32 *p, char **sp, - unsigned int *lenp, unsigned int maxlen) -{ - u32 len; - - len = be32_to_cpu(*p++); - if (len > maxlen) - return NULL; - *lenp = len; - *sp = (char *) p; - return p + XDR_QUADLEN(len); -} -EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); - /** * xdr_terminate_string - '\0'-terminate a string residing in an xdr_buf * @buf: XDR buffer where string resides @@ -2247,88 +2219,6 @@ out: } EXPORT_SYMBOL_GPL(xdr_process_buf); -/** - * xdr_stream_decode_opaque - Decode variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store opaque data - * @size: size of storage buffer @ptr - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @ptr - */ -ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret <= 0) - return ret; - memcpy(ptr, p, ret); - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); - -/** - * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque - * @xdr: pointer to xdr_stream - * @ptr: location to store pointer to opaque data - * @maxlen: maximum acceptable object size - * @gfp_flags: GFP mask to use - * - * Return values: - * On success, returns size of object stored in *@ptr - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE if the size of the object would exceed @maxlen - * %-ENOMEM on memory allocation failure - */ -ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, - size_t maxlen, gfp_t gfp_flags) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); - if (ret > 0) { - *ptr = kmemdup(p, ret, gfp_flags); - if (*ptr != NULL) - return ret; - ret = -ENOMEM; - } - *ptr = NULL; - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); - -/** - * xdr_stream_decode_string - Decode variable length string - * @xdr: pointer to xdr_stream - * @str: location to store string - * @size: size of storage buffer @str - * - * Return values: - * On success, returns length of NUL-terminated string stored in *@str - * %-EBADMSG on XDR buffer overflow - * %-EMSGSIZE on overflow of storage buffer @str - */ -ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) -{ - ssize_t ret; - void *p; - - ret = xdr_stream_decode_opaque_inline(xdr, &p, size); - if (ret > 0) { - memcpy(str, p, ret); - str[ret] = '\0'; - return strlen(str); - } - *str = '\0'; - return ret; -} -EXPORT_SYMBOL_GPL(xdr_stream_decode_string); - /** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream -- cgit v1.2.3 From f66e6bffc531bafaeb067e6f6af56f52d5cd4ac2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Jul 2025 19:13:55 -0700 Subject: SUNRPC: Silence warnings about parameters not being described Warning: net/sunrpc/auth_gss/gss_krb5_crypto.c:902 function parameter 'len' not described in 'krb5_etm_decrypt' Warning: net/sunrpc/auth_gss/gss_krb5_crypto.c:902 function parameter 'buf' not described in 'krb5_etm_decrypt' Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c 
b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 8f2d65c1e831..16dcf115de1e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -875,8 +875,8 @@ out_err: * krb5_etm_decrypt - Decrypt using the RFC 8009 rules * @kctx: Kerberos context * @offset: starting offset of the ciphertext, in bytes - * @len: - * @buf: + * @len: size of ciphertext to unwrap + * @buf: ciphertext to unwrap * @headskip: OUT: the enctype's confounder length, in octets * @tailskip: OUT: the enctype's HMAC length, in octets * -- cgit v1.2.3 From 9063de636cee235bd736ab3e4895e2826e606dea Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 25 Jul 2025 12:33:04 +0200 Subject: kcm: Fix splice support Flags passed in for splice() syscall should not end up in skb_recv_datagram(). As SPLICE_F_NONBLOCK == MSG_PEEK, kernel gets confused: skb isn't unlinked from a receive queue, while strp_msg::offset and strp_msg::full_len are updated. Unbreak the logic a bit more by mapping both O_NONBLOCK and SPLICE_F_NONBLOCK to MSG_DONTWAIT. This way we align with man splice(2) in regard to errno EAGAIN: SPLICE_F_NONBLOCK was specified in flags or one of the file descriptors had been marked as nonblocking (O_NONBLOCK), and the operation would block. Fixes: 5121197ecc5d ("kcm: close race conditions on sk_receive_queue") Fixes: 91687355b927 ("kcm: Splice support") Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250725-kcm-splice-v1-1-9a725ad2ee71@rbox.co Signed-off-by: Jakub Kicinski --- net/kcm/kcmsock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index a0be3896a934..a4971e6fa943 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1029,6 +1030,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, ssize_t copied; struct sk_buff *skb; + if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK) + flags = MSG_DONTWAIT; + else + flags = 0; + /* Only support splice for SOCKSEQPACKET */ skb = skb_recv_datagram(sk, flags, &err); -- cgit v1.2.3 From 2da4def0f487f24bbb0cece3bb2bcdcb918a0b72 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 Jul 2025 18:08:46 -0700 Subject: netpoll: prevent hanging NAPI when netcons gets enabled Paolo spotted hangs in NIPA running driver tests against virtio. The tests hang in virtnet_close() -> virtnet_napi_tx_disable(). The problem is only reproducible if running multiple of our tests in sequence (I used TEST_PROGS="xdp.py ping.py netcons_basic.sh \ netpoll_basic.py stats.py"). Initial suspicion was that this is a simple case of double-disable of NAPI, but instrumenting the code reveals: Deadlocked on NAPI ffff888007cd82c0 (virtnet_poll_tx): state: 0x37, disabled: false, owner: 0, listed: false, weight: 64 The NAPI was not in fact disabled, owner is 0 (rather than -1), so the NAPI "thinks" it's scheduled for CPU 0 but it's not listed (!list_empty(&n->poll_list) => false). It seems odd that normal NAPI processing would wedge itself like this. Better suspicion is that netpoll gets enabled while NAPI is polling, and also grabs the NAPI instance. 
This confuses napi_complete_done():

[netpoll]                            [normal NAPI]
                                     napi_poll()
                                       have = netpoll_poll_lock()
                                         rcu_access_pointer(dev->npinfo)
                                           return NULL # no netpoll
                                       __napi_poll()
                                         ->poll(->weight)
poll_napi()
  cmpxchg(->poll_owner, -1, cpu)
  poll_one_napi()
    set_bit(NAPI_STATE_NPSVC, ->state)
                                       napi_complete_done()
                                         if (NAPIF_STATE_NPSVC)
                                           return false # exit without clearing SCHED

This feels very unlikely, but perhaps virtio has some interactions with the hypervisor in the NAPI ->poll that makes the race window larger? Best I could do to prove the theory was to add and trigger this warning in napi_poll (just before netpoll_poll_unlock()): WARN_ONCE(!have && rcu_access_pointer(n->dev->npinfo) && napi_is_scheduled(n) && list_empty(&n->poll_list), "NAPI race with netpoll %px", n); If this warning hits the next virtio_close() will hang. This patch survived 30 test iterations without a hang (without it the longest clean run was around 10). Credit for triggering this goes to Breno's recent netconsole tests. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Paolo Abeni Link: https://lore.kernel.org/c5a93ed1-9abe-4880-a3bb-8d1678018b1d@redhat.com Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20250726010846.1105875-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a1da97b5b30b..5f65b62346d4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -768,6 +768,13 @@ int netpoll_setup(struct netpoll *np) if (err) goto flush; rtnl_unlock(); + + /* Make sure all NAPI polls which started before dev->npinfo + * was visible have exited before we start calling NAPI poll. + * NAPI skips locking if dev->npinfo is NULL. + */ + synchronize_rcu(); + return 0; flush: -- cgit v1.2.3 From 759dfc7d04bab1b0b86113f1164dc1fec192b859 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Mon, 28 Jul 2025 11:06:47 +0300 Subject: netlink: avoid infinite retry looping in netlink_unicast() netlink_attachskb() checks for the socket's read memory allocation constraints. Firstly, it has: rmem < READ_ONCE(sk->sk_rcvbuf) to check if the just increased rmem value fits into the socket's receive buffer. If not, it proceeds and tries to wait for the memory under: rmem + skb->truesize > READ_ONCE(sk->sk_rcvbuf) The checks don't cover the case when skb->truesize + sk->sk_rmem_alloc is equal to sk->sk_rcvbuf.
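(Illustrative numbers, not from the report: with sk_rcvbuf = 4096, sk_rmem_alloc = 3328 and skb->truesize = 768, the post-add rmem is exactly 4096, so the acceptance check 4096 < 4096 fails; yet the wait condition 3328 + 768 > 4096 is also false, so the task never sleeps either.)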
Thus the function neither successfully accepts these conditions, nor manages to reschedule the task - and is called in a retry loop for an indefinite time, which is caught as: rcu: INFO: rcu_sched self-detected stall on CPU rcu: 0-....: (25999 ticks this GP) idle=ef2/1/0x4000000000000000 softirq=262269/262269 fqs=6212 (t=26000 jiffies g=230833 q=259957) NMI backtrace for cpu 0 CPU: 0 PID: 22 Comm: kauditd Not tainted 5.10.240 #68 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-4.fc42 04/01/2014 Call Trace: dump_stack lib/dump_stack.c:120 nmi_cpu_backtrace.cold lib/nmi_backtrace.c:105 nmi_trigger_cpumask_backtrace lib/nmi_backtrace.c:62 rcu_dump_cpu_stacks kernel/rcu/tree_stall.h:335 rcu_sched_clock_irq.cold kernel/rcu/tree.c:2590 update_process_times kernel/time/timer.c:1953 tick_sched_handle kernel/time/tick-sched.c:227 tick_sched_timer kernel/time/tick-sched.c:1399 __hrtimer_run_queues kernel/time/hrtimer.c:1652 hrtimer_interrupt kernel/time/hrtimer.c:1717 __sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1113 asm_call_irq_on_stack arch/x86/entry/entry_64.S:808 netlink_attachskb net/netlink/af_netlink.c:1234 netlink_unicast net/netlink/af_netlink.c:1349 kauditd_send_queue kernel/audit.c:776 kauditd_thread kernel/audit.c:897 kthread kernel/kthread.c:328 ret_from_fork arch/x86/entry/entry_64.S:304 Restore the original behavior of the check, which the commit in Fixes accidentally broke when restructuring the code. Found by Linux Verification Center (linuxtesting.org). Fixes: ae8f160e7eb2 ("netlink: Fix wraparounds of sk->sk_rmem_alloc.") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250728080727.255138-1-pchelkin@ispras.ru Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5949855fa29e..e2f7080dd5d7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1218,7 +1218,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); rmem = atomic_add_return(skb->truesize, &sk->sk_rmem_alloc); - if ((rmem == skb->truesize || rmem < READ_ONCE(sk->sk_rcvbuf)) && + if ((rmem == skb->truesize || rmem <= READ_ONCE(sk->sk_rcvbuf)) && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); return 0; -- cgit v1.2.3 From d45cf1e7d7180256e17c9ce88e32e8061a7887fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Jul 2025 13:17:38 +0000 Subject: ipv6: reject malicious packets in ipv6_gso_segment() syzbot was able to craft a packet with very long IPv6 extension headers leading to an overflow of skb->transport_header. This 16-bit field has a limited range.
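The guard added below boils down to checking that a computed offset survives the round trip through the narrow field; a standalone sketch of the idiom, with a hypothetical uint16_t stand-in for skb->transport_header (not the patch's exact code):

#include <stdbool.h>
#include <stdint.h>

typedef uint16_t hdr_off_t;	/* skb->transport_header is 16 bits wide */

static bool set_offset_careful(long offset, hdr_off_t *out)
{
	/* reject offsets that the narrow field would truncate */
	if (offset != (hdr_off_t)offset)
		return false;
	/* ~0 doubles as the "offset unset" marker, so reject it too */
	if (offset == (hdr_off_t)~0U)
		return false;
	*out = (hdr_off_t)offset;
	return true;
}

With ~64 KiB of stacked extension headers the offset wraps modulo 2^16, so an unchecked store silently records a bogus value; the careful variant lets the caller abort segmentation instead.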
Add skb_reset_transport_header_careful() helper and use it from ipv6_gso_segment() WARNING: CPU: 0 PID: 5871 at ./include/linux/skbuff.h:3032 skb_reset_transport_header include/linux/skbuff.h:3032 [inline] WARNING: CPU: 0 PID: 5871 at ./include/linux/skbuff.h:3032 ipv6_gso_segment+0x15e2/0x21e0 net/ipv6/ip6_offload.c:151 Modules linked in: CPU: 0 UID: 0 PID: 5871 Comm: syz-executor211 Not tainted 6.16.0-rc6-syzkaller-g7abc678e3084 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:skb_reset_transport_header include/linux/skbuff.h:3032 [inline] RIP: 0010:ipv6_gso_segment+0x15e2/0x21e0 net/ipv6/ip6_offload.c:151 Call Trace: skb_mac_gso_segment+0x31c/0x640 net/core/gso.c:53 nsh_gso_segment+0x54a/0xe10 net/nsh/nsh.c:110 skb_mac_gso_segment+0x31c/0x640 net/core/gso.c:53 __skb_gso_segment+0x342/0x510 net/core/gso.c:124 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x857/0x11b0 net/core/dev.c:3950 validate_xmit_skb_list+0x84/0x120 net/core/dev.c:4000 sch_direct_xmit+0xd3/0x4b0 net/sched/sch_generic.c:329 __dev_xmit_skb net/core/dev.c:4102 [inline] __dev_queue_xmit+0x17b6/0x3a70 net/core/dev.c:4679 Fixes: d1da932ed4ec ("ipv6: Separate ipv6 offload support") Reported-by: syzbot+af43e647fd835acc02df@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/688a1a05.050a0220.5d226.0008.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Dawid Osuchowski Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250730131738.3385939-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 23 +++++++++++++++++++++++ net/ipv6/ip6_offload.c | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b8b06e71b73e..14b923ddb6df 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3033,6 +3033,29 @@ static inline void skb_reset_transport_header(struct sk_buff *skb) skb->transport_header = offset; } +/** + * skb_reset_transport_header_careful - conditionally reset transport header + * @skb: buffer to alter + * + * Hardened version of skb_reset_transport_header(). + * + * Returns: true if the operation was a success. + */ +static inline bool __must_check +skb_reset_transport_header_careful(struct sk_buff *skb) +{ + long offset = skb->data - skb->head; + + if (unlikely(offset != (typeof(skb->transport_header))offset)) + return false; + + if (unlikely(offset == (typeof(skb->transport_header))~0U)) + return false; + + skb->transport_header = offset; + return true; +} + static inline void skb_set_transport_header(struct sk_buff *skb, const int offset) { diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 9822163428b0..fce91183797a 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -148,7 +148,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { - skb_reset_transport_header(skb); + if (!skb_reset_transport_header_careful(skb)) + goto out; + segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; -- cgit v1.2.3 From ae8508b25def57982493c48694ef135973bfabe0 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Tue, 29 Jul 2025 02:31:49 +0900 Subject: net/sched: taprio: enforce minimum value for picos_per_byte Syzbot reported a WARNING in taprio_get_start_time(). 
When link speed is 470,589 or greater, q->picos_per_byte becomes too small, causing length_to_duration(q, ETH_ZLEN) to return zero. This zero value leads to validation failures in fill_sched_entry() and parse_taprio_schedule(), allowing arbitrary values to be assigned to entry->interval and cycle_time. As a result, sched->cycle can become zero. Since SPEED_800000 is the largest defined speed in include/uapi/linux/ethtool.h, this issue can occur in realistic scenarios. To ensure length_to_duration() returns a non-zero value for minimum-sized Ethernet frames (ETH_ZLEN = 60), picos_per_byte must be at least 17 (60 * 17 > PSEC_PER_NSEC which is 1000). This patch enforces a minimum value of 17 for picos_per_byte when the calculated value would be lower, and adds a warning message to inform users that scheduling accuracy may be affected at very high link speeds. Fixes: fb66df20a720 ("net/sched: taprio: extend minimum interval restriction to entire cycle too") Reported-by: syzbot+398e1ee4ca2cac05fddb@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=398e1ee4ca2cac05fddb Signed-off-by: Takamitsu Iwai Link: https://patch.msgid.link/20250728173149.45585-1-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index e759e43ad27e..39b735386996 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -43,6 +43,11 @@ static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX +/* Minimum value for picos_per_byte to ensure non-zero duration + * for minimum-sized Ethernet frames (ETH_ZLEN = 60). + * 60 * 17 > PSEC_PER_NSEC (1000) + */ +#define TAPRIO_PICOS_PER_BYTE_MIN 17 struct sched_entry { /* Durations between this GCL entry and the GCL entry where the @@ -1284,7 +1289,8 @@ static void taprio_start_sched(struct Qdisc *sch, } static void taprio_set_picos_per_byte(struct net_device *dev, - struct taprio_sched *q) + struct taprio_sched *q, + struct netlink_ext_ack *extack) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; @@ -1300,6 +1306,15 @@ static void taprio_set_picos_per_byte(struct net_device *dev, skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; + if (picos_per_byte < TAPRIO_PICOS_PER_BYTE_MIN) { + if (!extack) + pr_warn("Link speed %d is too high. Schedule may be inaccurate.\n", + speed); + NL_SET_ERR_MSG_FMT_MOD(extack, + "Link speed %d is too high. 
Schedule may be inaccurate.", speed); + picos_per_byte = TAPRIO_PICOS_PER_BYTE_MIN; + } atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", @@ -1324,7 +1339,7 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, if (dev != qdisc_dev(q->root)) continue; - taprio_set_picos_per_byte(dev, q); + taprio_set_picos_per_byte(dev, q, NULL); stab = rtnl_dereference(q->root->stab); @@ -1844,7 +1859,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ - taprio_set_picos_per_byte(dev, q); + taprio_set_picos_per_byte(dev, q, extack); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) -- cgit v1.2.3 From 1dbf1d590d10a6d1978e8184f8dfe20af22d680a Mon Sep 17 00:00:00 2001 From: Sharath Chandra Vurukala Date: Wed, 30 Jul 2025 16:21:18 +0530 Subject: net: Add locking to protect skb->dev access in ip_output In ip_output(), skb->dev is updated from skb_dst(skb)->dev; this can become invalid when the interface is unregistered and freed. Introduce a new skb_dst_dev_rcu() function to be used instead of skb_dst_dev() under RCU locks in ip_output(). This will ensure that all the skbs associated with the dev being deregistered are transmitted out first, before the dev is freed. Given that ip_output() is called within an rcu_read_lock() critical section or from a bottom-half context, it is safe to introduce an RCU read-side critical section within it. Multiple panic call stacks, from different functions, were observed when UL traffic was run concurrently with device deregistration; one sample is pasted for reference. [496733.627565][T13385] Call trace: [496733.627570][T13385] bpf_prog_ce7c9180c3b128ea_cgroupskb_egres+0x24c/0x7f0 [496733.627581][T13385] __cgroup_bpf_run_filter_skb+0x128/0x498 [496733.627595][T13385] ip_finish_output+0xa4/0xf4 [496733.627605][T13385] ip_output+0x100/0x1a0 [496733.627613][T13385] ip_send_skb+0x68/0x100 [496733.627618][T13385] udp_send_skb+0x1c4/0x384 [496733.627625][T13385] udp_sendmsg+0x7b0/0x898 [496733.627631][T13385] inet_sendmsg+0x5c/0x7c [496733.627639][T13385] __sys_sendto+0x174/0x1e4 [496733.627647][T13385] __arm64_sys_sendto+0x28/0x3c [496733.627653][T13385] invoke_syscall+0x58/0x11c [496733.627662][T13385] el0_svc_common+0x88/0xf4 [496733.627669][T13385] do_el0_svc+0x2c/0xb0 [496733.627676][T13385] el0_svc+0x2c/0xa4 [496733.627683][T13385] el0t_64_sync_handler+0x68/0xb4 [496733.627689][T13385] el0t_64_sync+0x1a4/0x1a8 Changes in v3: - Replaced WARN_ON() with WARN_ON_ONCE(), as suggested by Willem de Bruijn. - Dropped legacy lines mistakenly pulled in from an outdated branch.
Changes in v2: - Addressed review comments from Eric Dumazet - Used READ_ONCE() to prevent potential load/store tearing - Added skb_dst_dev_rcu() and used it along with rcu_read_lock() in ip_output Signed-off-by: Sharath Chandra Vurukala Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250730105118.GA26100@hu-sharathv-hyd.qualcomm.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 12 ++++++++++++ net/ipv4/ip_output.c | 15 ++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index 00467c1b5093..bab01363bb97 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -568,11 +568,23 @@ static inline struct net_device *dst_dev(const struct dst_entry *dst) return READ_ONCE(dst->dev); } +static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) +{ + /* In the future, use rcu_dereference(dst->dev) */ + WARN_ON_ONCE(!rcu_read_lock_held()); + return READ_ONCE(dst->dev); +} + static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) { return dst_dev(skb_dst(skb)); } +static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) +{ + return dst_dev_rcu(skb_dst(skb)); +} + static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) { return dev_net(skb_dst_dev(skb)); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 10a1d182fd84..84e7f8a2f50f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -425,15 +425,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst_dev(skb), *indev = skb->dev; + struct net_device *dev, *indev = skb->dev; + int ret_val; + rcu_read_lock(); + dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip_finish_output, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, + net, sk, skb, indev, dev, + ip_finish_output, + !(IPCB(skb)->flags & IPSKB_REROUTED)); + rcu_read_unlock(); + return ret_val; } EXPORT_SYMBOL(ip_output); -- cgit v1.2.3 From cc5d59081fa26506d02de2127ab822f40d88bc5a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 31 Jul 2025 14:00:56 -0400 Subject: sunrpc: fix client side handling of tls alerts A security exploit was discovered in NFS over TLS in tls_alert_recv due to its assumption that there is valid data in the msghdr's iterator's kvec. Instead, this patch proposes to rework how control messages are set up and used by sock_recvmsg(). If no control message structure is set up, the kTLS layer will read and process TLS data record types. As soon as it encounters a TLS control message, it will return an error. At that point, NFS can set up a kvec-backed control buffer and read in the control message such as a TLS alert. Scott found that a msg iterator can advance the kvec pointer as a part of the copy process, thus we need to revert the iterator before calling into tls_alert_recv().
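For context on the split the patch works around: userspace kTLS exposes the same design, where the TLS record type arrives as a SOL_TLS / TLS_GET_RECORD_TYPE control message while an alert's level/description land in the data buffer. A minimal sketch of a consumer (assumes a socket already upgraded via setsockopt(SOL_TLS, TLS_RX, ...); error handling elided):

#include <stdio.h>
#include <linux/tls.h>
#include <sys/socket.h>

#ifndef SOL_TLS
#define SOL_TLS 282
#endif

static void read_one_tls_record(int fd)
{
	unsigned char buf[256];
	char cbuf[CMSG_SPACE(sizeof(unsigned char))];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	ssize_t n = recvmsg(fd, &msg, 0);
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	/* 21 is the TLS "alert" record type; for alerts, the payload
	 * buffer holds the level and description bytes */
	if (n >= 2 && cmsg && cmsg->cmsg_level == SOL_TLS &&
	    cmsg->cmsg_type == TLS_GET_RECORD_TYPE &&
	    *CMSG_DATA(cmsg) == 21)
		printf("TLS alert: level %u, description %u\n",
		       buf[0], buf[1]);
}

The kernel-side rework below follows the same two-step shape: read data records without a control buffer, and only set one up once a control record is signalled.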
Fixes: dea034b963c8 ("SUNRPC: Capture CMSG metadata on client-side receive") Suggested-by: Trond Myklebust Suggested-by: Scott Mayhew Signed-off-by: Olga Kornievskaia Link: https://lore.kernel.org/r/20250731180058.4669-3-okorniev@redhat.com Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 04ff66758fc3..c5f7bbf5775f 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -358,7 +358,7 @@ xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) static int xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, - struct cmsghdr *cmsg, int ret) + unsigned int *msg_flags, struct cmsghdr *cmsg, int ret) { u8 content_type = tls_get_record_type(sock->sk, cmsg); u8 level, description; @@ -371,7 +371,7 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, * record, even though there might be more frames * waiting to be decrypted. */ - msg->msg_flags &= ~MSG_EOR; + *msg_flags &= ~MSG_EOR; break; case TLS_RECORD_TYPE_ALERT: tls_alert_recv(sock->sk, msg, &level, &description); @@ -386,19 +386,33 @@ xs_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -xs_sock_recv_cmsg(struct socket *sock, struct msghdr *msg, int flags) +xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; int ret; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); - ret = sock_recvmsg(sock, msg, flags); - if (msg->msg_controllen != sizeof(u)) - ret = xs_sock_process_cmsg(sock, msg, &u.cmsg, ret); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, flags); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, + -EAGAIN); + } return ret; } @@ -408,7 +422,13 @@ xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) ssize_t ret; if (seek != 0) iov_iter_advance(&msg->msg_iter, seek); - ret = xs_sock_recv_cmsg(sock, msg, flags); + ret = sock_recvmsg(sock, msg, flags); + /* Handle TLS inband control message lazily */ + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = xs_sock_recv_cmsg(sock, &msg->msg_flags, flags); + } return ret > 0 ? ret + seek : ret; } @@ -434,7 +454,7 @@ xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, size_t count) { iov_iter_discard(&msg->msg_iter, ITER_DEST, count); - return xs_sock_recv_cmsg(sock, msg, flags); + return xs_sock_recvmsg(sock, msg, flags, 0); } #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -- cgit v1.2.3 From fa516c0d8bf90da9d5b168757162205aafe5d0e1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 31 Jul 2025 18:13:35 -0700 Subject: net: devmem: fix DMA direction on unmapping Looks like we always unmap the DMA_BUF with DMA_FROM_DEVICE direction. While at it unexport __net_devmem_dmabuf_binding_free(), it's internal. Found by code inspection. 
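The rule being enforced is the dma-buf API's symmetry requirement: dma_buf_unmap_attachment_unlocked() must be passed the same direction that the mapping used. Distilled to the pattern the patch adopts (field names taken from the diff below):

/* at bind time: remember how the attachment was mapped */
binding->direction = direction;	/* DMA_FROM_DEVICE for Rx, DMA_TO_DEVICE for Tx */
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, direction);
/* ... */
/* at teardown: unmap with the recorded direction, not a hard-coded constant */
dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
				  binding->direction);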
Fixes: bd61848900bf ("net: devmem: Implement TX path") Acked-by: Stanislav Fomichev Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250801011335.2267515-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 6 +++--- net/core/devmem.h | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/devmem.c b/net/core/devmem.c index b3a62ca0df65..24c591ab38ae 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -70,14 +70,13 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq) gen_pool_destroy(binding->chunk_pool); dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, - DMA_FROM_DEVICE); + binding->direction); dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); kvfree(binding->tx_vec); kfree(binding); } -EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -208,6 +207,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, mutex_init(&binding->lock); binding->dmabuf = dmabuf; + binding->direction = direction; binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); if (IS_ERR(binding->attachment)) { @@ -312,7 +312,7 @@ err_tx_vec: kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, - DMA_FROM_DEVICE); + direction); err_detach: dma_buf_detach(dmabuf, binding->attachment); err_free_binding: diff --git a/net/core/devmem.h b/net/core/devmem.h index 0a3b28ba5c13..41cd6e1c9141 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -56,6 +56,9 @@ struct net_devmem_dmabuf_binding { */ u32 id; + /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */ + enum dma_data_direction direction; + /* Array of net_iov pointers for this binding, sorted by virtual * address. This array is convenient to map the virtual addresses to * net_iovs in the TX path. @@ -165,10 +168,6 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) { } -static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) -{ -} - static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, enum dma_data_direction direction, -- cgit v1.2.3 From 01d3c8417b9c1b884a8a981a3b886da556512f36 Mon Sep 17 00:00:00 2001 From: Quang Le Date: Fri, 1 Aug 2025 13:54:16 -0400 Subject: net/packet: fix a race in packet_set_ring() and packet_notifier() When packet_set_ring() releases po->bind_lock, another thread can run packet_notifier() and process a NETDEV_UP event. This race and the fix are both similar to that of commit 15fe076edea7 ("net/packet: fix a race in packet_bind() and packet_notifier()"). There too the packet_notifier NETDEV_UP event managed to run while a po->bind_lock critical section had to be temporarily released. And the fix was similarly to temporarily set po->num to zero to keep the socket unhooked until the lock is retaken. The po->bind_lock sections in packet_set_ring and packet_notifier predate the start of git history.
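A plausible interleaving, reconstructed from the description above rather than taken from a trace, showing how the notifier re-hooks the socket in the unlocked window:

/* CPU0: packet_set_ring()             CPU1: packet_notifier(NETDEV_UP)
 *
 * spin_lock(&po->bind_lock);
 * __unregister_prot_hook(sk, false);
 * spin_unlock(&po->bind_lock);
 *                                     spin_lock(&po->bind_lock);
 *                                     // po->num != 0 and not RUNNING,
 *                                     // so the socket looks bound:
 *                                     register_prot_hook(sk);
 *                                     spin_unlock(&po->bind_lock);
 * // ring vectors are swapped while
 * // the hook can deliver packets into them
 */

Zeroing po->num before the unlock closes the window, since the notifier only re-registers the hook for sockets with a non-zero po->num.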
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Quang Le Signed-off-by: Willem de Bruijn Link: https://patch.msgid.link/20250801175423.2970334-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index bc438d0d96a7..a7017d7f0927 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4573,10 +4573,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; - if (was_running) { - WRITE_ONCE(po->num, 0); + WRITE_ONCE(po->num, 0); + if (was_running) __unregister_prot_hook(sk, false); - } + spin_unlock(&po->bind_lock); synchronize_net(); @@ -4608,10 +4608,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); - if (was_running) { - WRITE_ONCE(po->num, num); + WRITE_ONCE(po->num, num); + if (was_running) register_prot_hook(sk); - } + spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ -- cgit v1.2.3 From ffd2dc4c6c49ff4f1e5d34e454a6a55608104c17 Mon Sep 17 00:00:00 2001 From: Maher Azzouzi Date: Fri, 1 Aug 2025 17:18:57 -0700 Subject: net/sched: mqprio: fix stack out-of-bounds write in tc entry parsing TCA_MQPRIO_TC_ENTRY_INDEX is validated using NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), which allows the value TC_QOPT_MAX_QUEUE (16). This leads to a 4-byte out-of-bounds stack write in the fp[] array, which only has room for 16 elements (0–15). Fix this by changing the policy to allow only up to TC_QOPT_MAX_QUEUE - 1. Fixes: f62af20bed2d ("net/sched: mqprio: allow per-TC user input of FP adminStatus") Reviewed-by: Eric Dumazet Signed-off-by: Maher Azzouzi Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250802001857.2702497-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/sched/sch_mqprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 51d4013b6121..f3e5ef9a9592 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -152,7 +152,7 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, - TC_QOPT_MAX_QUEUE), + TC_QOPT_MAX_QUEUE - 1), [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), -- cgit v1.2.3 From e6d76268813dc64cc0b74ea9c274501f2de05344 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Mon, 4 Aug 2025 16:44:57 +0000 Subject: net: Update threaded state in napi config in netif_set_threaded Commit 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") added support to enable/disable threaded napi using netlink. This also extended the napi config save/restore functionality to set the napi threaded state. This breaks netdev reset for drivers that use napi threaded at device level and also use napi config save/restore on napi_disable/napi_enable.
Basically, on a netdev with napi threaded enabled at device level, a napi_enable call will get stuck trying to stop the napi kthread. This is because napi->config->threaded is set to disabled when threaded is enabled at device level. The issue can be reproduced on a virtio-net device using qemu. To reproduce the issue, run the following: echo 1 > /sys/class/net/eth0/threaded ethtool -L eth0 combined 1 Update the threaded state in napi config in netif_set_threaded and add a new test that verifies this scenario. Tested on qemu with virtio-net: NETIF=eth0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250804164457.2494390-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 26 ++--- tools/testing/selftests/drivers/net/Makefile | 1 + .../testing/selftests/drivers/net/napi_threaded.py | 111 +++++++++++++++++++++ 3 files changed, 121 insertions(+), 17 deletions(-) create mode 100755 tools/testing/selftests/drivers/net/napi_threaded.py (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b28ce68830b2..68dc47d7e700 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6978,6 +6978,12 @@ int napi_set_threaded(struct napi_struct *napi, if (napi->config) napi->config->threaded = threaded; + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ if (!threaded && napi->thread) { napi_stop_kthread(napi); } else { @@ -7011,23 +7017,9 @@ int netif_set_threaded(struct net_device *dev, WRITE_ONCE(dev->threaded, threaded); - /* Make sure kthread is created before THREADED bit - * is set. - */ - smp_mb__before_atomic(); - - /* Setting/unsetting threaded mode on a napi might not immediately - * take effect, if the current napi instance is actively being - * polled. In this case, the switch between threaded mode and - * softirq mode will happen in the next round of napi_schedule(). - * This should not cause hiccups/stalls to the live traffic. - */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (!threaded && napi->thread) - napi_stop_kthread(napi); - else - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); - } + /* The error should not occur as the kthreads are already created.
*/ + list_for_each_entry(napi, &dev->napi_list, dev_list) + WARN_ON_ONCE(napi_set_threaded(napi, threaded)); return err; } diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 3556f3563e08..984ece05f7f9 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -11,6 +11,7 @@ TEST_GEN_FILES := \ TEST_PROGS := \ napi_id.py \ + napi_threaded.py \ netcons_basic.sh \ netcons_cmdline.sh \ netcons_fragmented_msg.sh \ diff --git a/tools/testing/selftests/drivers/net/napi_threaded.py b/tools/testing/selftests/drivers/net/napi_threaded.py new file mode 100755 index 000000000000..b2698db39817 --- /dev/null +++ b/tools/testing/selftests/drivers/net/napi_threaded.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Test napi threaded states. +""" + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, ksft_ne, ksft_ge +from lib.py import NetDrvEnv, NetdevFamily +from lib.py import cmd, defer, ethtool + + +def _assert_napi_threaded_enabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'enabled') + ksft_ne(napi.get('pid'), None) + + +def _assert_napi_threaded_disabled(nl, napi_id) -> None: + napi = nl.napi_get({'id': napi_id}) + ksft_eq(napi['threaded'], 'disabled') + ksft_eq(napi.get('pid'), None) + + +def _set_threaded_state(cfg, threaded) -> None: + cmd(f"echo {threaded} > /sys/class/net/{cfg.ifname}/threaded") + + +def _setup_deferred_cleanup(cfg) -> None: + combined = ethtool(f"-l {cfg.ifname}", json=True)[0].get("combined", 0) + ksft_ge(combined, 2) + defer(ethtool, f"-L {cfg.ifname} combined {combined}") + + threaded = cmd(f"cat /sys/class/net/{cfg.ifname}/threaded").stdout + defer(_set_threaded_state, cfg, threaded) + + +def enable_dev_threaded_disable_napi_threaded(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level and + then disabled at napi level for one napi, the threaded state + of all napis is preserved after a change in number of queues. + """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + # disable threaded for napi1 + nl.napi_set({'id': napi1_id, 'threaded': 'disabled'}) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_disabled(nl, napi1_id) + + +def change_num_queues(cfg, nl) -> None: + """ + Test that when napi threaded is enabled at device level, + the napi threaded state is preserved after a change in + number of queues. 
+ """ + + napis = nl.napi_get({'ifindex': cfg.ifindex}, dump=True) + ksft_ge(len(napis), 2) + + napi0_id = napis[0]['id'] + napi1_id = napis[1]['id'] + + _setup_deferred_cleanup(cfg) + + # set threaded + _set_threaded_state(cfg, 1) + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + cmd(f"ethtool -L {cfg.ifname} combined 1") + cmd(f"ethtool -L {cfg.ifname} combined 2") + + # check napi threaded is set for both napis + _assert_napi_threaded_enabled(nl, napi0_id) + _assert_napi_threaded_enabled(nl, napi1_id) + + +def main() -> None: + """ Ksft boiler plate main """ + + with NetDrvEnv(__file__, queue_count=2) as cfg: + ksft_run([change_num_queues, + enable_dev_threaded_disable_napi_threaded], + args=(cfg, NetdevFamily())) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From 42e42562c9cfcdacf000f1b42284a4fad24f8546 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 4 Aug 2025 11:05:43 +0200 Subject: xfrm: flush all states in xfrm_state_fini While reverting commit f75a2804da39 ("xfrm: destroy xfrm_state synchronously on net exit path"), I incorrectly changed xfrm_state_flush's "proto" argument back to IPSEC_PROTO_ANY. This reverts some of the changes in commit dbb2483b2a46 ("xfrm: clean up xfrm protocol checks"), and leads to some states not being removed when we exit the netns. Pass 0 instead of IPSEC_PROTO_ANY from both xfrm_state_fini xfrm6_tunnel_net_exit, so that xfrm_state_flush deletes all states. Fixes: 2a198bbec691 ("Revert "xfrm: destroy xfrm_state synchronously on net exit path"") Reported-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6641a61fe0e2e89ae8c5 Tested-by: syzbot+6641a61fe0e2e89ae8c5@syzkaller.appspotmail.com Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_tunnel.c | 2 +- net/xfrm/xfrm_state.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5120a763da0d..0a0eeaed0591 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); + xfrm_state_flush(net, 0, false); xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 77db3b5fe4ac..78fcbb89cf32 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -3297,7 +3297,7 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); + xfrm_state_flush(net, 0, false); flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); -- cgit v1.2.3 From bee47cb026e762841f3faece47b51f985e215edb Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 29 Jul 2025 12:40:20 -0400 Subject: sunrpc: fix handling of server side tls alerts Scott Mayhew discovered a security exploit in NFS over TLS in tls_alert_recv() due to its assumption it can read data from the msg iterator's kvec.. 
The kTLS implementation splits TLS non-data record payload between the control message buffer (which includes the type, such as TLS alert or TLS cipher change) and the rest of the payload (say, a TLS alert's level/description), which goes into the msg payload buffer. This patch proposes to rework how control messages are set up and used by sock_recvmsg(). If no control message structure is set up, the kTLS layer will read and process TLS data record types. As soon as it encounters a TLS control message, it will return an error. At that point, NFS can set up a kvec-backed msg buffer and read in the control message such as a TLS alert. The msg iterator can advance the kvec pointer as a part of the copy process, thus we need to revert the iterator before calling into tls_alert_recv(). Reported-by: Scott Mayhew Fixes: 5e052dda121e ("SUNRPC: Recognize control messages in server-side TCP socket code") Suggested-by: Trond Myklebust Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 46c156b121db..e2c5e0e626f9 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -257,20 +257,47 @@ svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) +svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; - struct socket *sock = svsk->sk_sock; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; + int ret; + + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN); + } + return ret; +} + +static int +svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg) +{ int ret; + struct socket *sock = svsk->sk_sock; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); - if (unlikely(msg->msg_controllen != sizeof(u))) - ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret); + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags); + } return ret; } @@ -321,7 +348,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, iov_iter_advance(&msg.msg_iter, seek); buflen -= seek; } - len = svc_tcp_sock_recv_cmsg(svsk, &msg); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len > 0) svc_flush_bvec(bvec, len, seek); @@ -1018,7 +1045,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); - len = svc_tcp_sock_recv_cmsg(svsk, &msg); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len < 0) return len; svsk->sk_tcplen += len; -- cgit v1.2.3 From 234d1eff5d4987024be9d40ac07b918a5ae8db1a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 4 Aug 2025 11:26:25 +0200 Subject: xfrm: restore GSO for SW crypto
Commit 49431af6c4ef incorrectly assumes that the GSO path is only used by HW offload, but it's also useful for SW crypto. This patch re-enables GSO for SW crypto. It's not an exact revert, in order to preserve the other changes made to xfrm_dev_offload_ok afterwards, but it reverts all of its effects. Fixes: 49431af6c4ef ("xfrm: rely on XFRM offload") Signed-off-by: Sabrina Dubroca Reviewed-by: Leon Romanovsky Reviewed-by: Zhu Yanjun Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index d2819baea414..1f88472aaac0 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -415,10 +415,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) struct net_device *dev = x->xso.dev; bool check_tunnel_size; - if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) + if (!x->type_offload || + (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)) return false; - if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { + if ((!dev || dev == xfrm_dst_path(dst)->dev) && + !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; @@ -430,6 +432,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; ok: + if (!dev) + return true; + check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; switch (x->props.family) { -- cgit v1.2.3 From 65f079a6c446a939eefe71e6d5957d5d6365fcf9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 4 Aug 2025 11:26:26 +0200 Subject: xfrm: bring back device check in validate_xmit_xfrm This is a partial revert of commit d53dda291bbd993a29b84d358d282076e3d01506. This change causes traffic using GSO with SW crypto running through a NIC capable of HW offload to no longer get segmented during validate_xmit_xfrm, and is unrelated to the bonding use case mentioned in the commit. Fixes: d53dda291bbd ("xfrm: Remove unneeded device check from validate_xmit_xfrm") Signed-off-by: Sabrina Dubroca Reviewed-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 1f88472aaac0..c7a1f080d2de 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -155,7 +155,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) { + if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || + unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ -- cgit v1.2.3 From 1118aaa3b35157777890fffab91d8c1da841b20b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 4 Aug 2025 11:26:27 +0200 Subject: udp: also consider secpath when evaluating ipsec use for checksumming Commit b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") tried to fix checksumming in UFO when the packets are going through IPsec, so that we can't rely on offloads because the UDP header and payload will be encrypted. But when doing a TCP test over VXLAN going through IPsec transport mode with GSO enabled (esp4_offload module loaded), I'm seeing broken UDP checksums on the encap after successful decryption.
The skbs get to udp4_ufo_fragment/__skb_udp_tunnel_segment via __dev_queue_xmit -> validate_xmit_skb -> skb_gso_segment and at this point we've already dropped the dst (unless the device sets IFF_XMIT_DST_RELEASE, which is not common), so need_ipsec is false and we proceed with checksum offload. Make need_ipsec also check the secpath, which is not dropped on this callpath. Fixes: b40c5f4fde22 ("udp: disable inner UDP checksum offloads in IPsec case") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 5128e2a5b00a..b1f3fd302e9d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -217,7 +217,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; - need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); + need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && !need_ipsec && -- cgit v1.2.3 From 25a8b88f000c33a1d580c317e93e40b953dc2fa5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Jul 2025 23:45:32 +0200 Subject: netfilter: add back NETFILTER_XTABLES dependencies Some Kconfig symbols were changed to depend on the 'bool' symbol NETFILTER_XTABLES_LEGACY, which means they can now be set to built-in when the xtables code itself is in a loadable module: x86_64-linux-ld: vmlinux.o: in function `arpt_unregister_table_pre_exit': (.text+0x1831987): undefined reference to `xt_find_table' x86_64-linux-ld: vmlinux.o: in function `get_info.constprop.0': arp_tables.c:(.text+0x1831aab): undefined reference to `xt_request_find_table_lock' x86_64-linux-ld: arp_tables.c:(.text+0x1831bea): undefined reference to `xt_table_unlock' x86_64-linux-ld: vmlinux.o: in function `do_arpt_get_ctl': arp_tables.c:(.text+0x183205d): undefined reference to `xt_find_table_lock' x86_64-linux-ld: arp_tables.c:(.text+0x18320c1): undefined reference to `xt_table_unlock' x86_64-linux-ld: arp_tables.c:(.text+0x183219a): undefined reference to `xt_recseq' Change these to depend on both NETFILTER_XTABLES and NETFILTER_XTABLES_LEGACY. Fixes: 9fce66583f06 ("netfilter: Exclude LEGACY TABLES on PREEMPT_RT.") Signed-off-by: Arnd Bergmann Acked-by: Florian Westphal Tested-by: Breno Leitao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/Kconfig | 1 + net/ipv4/netfilter/Kconfig | 3 +++ net/ipv6/netfilter/Kconfig | 1 + 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 60f28e4fb5c0..4fd5a6ea26b4 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -43,6 +43,7 @@ config NF_CONNTRACK_BRIDGE config BRIDGE_NF_EBTABLES_LEGACY tristate "Legacy EBTABLES support" depends on BRIDGE && NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default n help Legacy ebtables packet/frame classifier. diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2c438b140e88..7dc9772fe2d8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -14,6 +14,7 @@ config NF_DEFRAG_IPV4 config IP_NF_IPTABLES_LEGACY tristate "Legacy IP tables support" depends on NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default m if NETFILTER_XTABLES_LEGACY help iptables is a legacy packet classifier. 
@@ -326,6 +327,7 @@ endif # IP_NF_IPTABLES config IP_NF_ARPTABLES tristate "Legacy ARPTABLES support" depends on NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default n help arptables is a legacy packet classifier. @@ -343,6 +345,7 @@ config IP_NF_ARPFILTER select IP_NF_ARPTABLES select NETFILTER_FAMILY_ARP depends on NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 276860f65baa..81daf82ddc2d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -10,6 +10,7 @@ menu "IPv6: Netfilter Configuration" config IP6_NF_IPTABLES_LEGACY tristate "Legacy IP6 tables support" depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default m if NETFILTER_XTABLES_LEGACY help ip6tables is a legacy packet classifier. -- cgit v1.2.3 From de788b2e6227462b6dcd0e07474e72c089008f74 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Aug 2025 17:25:08 +0200 Subject: netfilter: ctnetlink: fix refcount leak on table dump There is a reference count leak in ctnetlink_dump_table(): if (res < 0) { nf_conntrack_get(&ct->ct_general); // HERE cb->args[1] = (unsigned long)ct; ... While it's very unlikely, it's possible that ct == last. If this happens, then the refcount of ct was already incremented. This 2nd increment is never undone. This prevents the conntrack object from being released, which in turn prevents cnet->count from dropping back to 0. This will then block the netns dismantle (or conntrack rmmod) as nf_conntrack_cleanup_net_list() will wait forever. This can be reproduced by running the conntrack_resize.sh selftest in a loop. It takes ~20 minutes for me on a preemptible kernel on average before I see a runaway kworker spinning in nf_conntrack_cleanup_net_list. One fix would be to change this to: if (res < 0) { if (ct != last) nf_conntrack_get(&ct->ct_general); But this reference counting isn't needed in the first place. We can just store a cookie value instead. A followup patch will do the same for ctnetlink_exp_dump_table, it looks to me as if this has the same problem and like ctnetlink_dump_table, we only need a 'skip hint', not the actual object so we can apply the same cookie strategy there as well. Fixes: d205dc40798d ("[NETFILTER]: ctnetlink: fix deadlock in table dumping") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 486d52b45fe5..f403acd82437 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -884,8 +884,6 @@ errout: static int ctnetlink_done(struct netlink_callback *cb) { - if (cb->args[1]) - nf_ct_put((struct nf_conn *)cb->args[1]); kfree(cb->data); return 0; } @@ -1208,19 +1206,26 @@ ignore_entry: return 0; } +static unsigned long ctnetlink_get_id(const struct nf_conn *ct) +{ + unsigned long id = nf_ct_get_id(ct); + + return id ? id : 1; +} + static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ?
NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); - struct nf_conn *ct, *last; + unsigned long last_id = cb->args[1]; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *nf_ct_evict[8]; + struct nf_conn *ct; int res, i; spinlock_t *lockp; - last = (struct nf_conn *)cb->args[1]; i = 0; local_bh_disable(); @@ -1257,7 +1262,7 @@ restart: continue; if (cb->args[1]) { - if (ct != last) + if (ctnetlink_get_id(ct) != last_id) continue; cb->args[1] = 0; } @@ -1270,8 +1275,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, true, flags); if (res < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; + cb->args[1] = ctnetlink_get_id(ct); spin_unlock(lockp); goto out; } @@ -1284,12 +1288,10 @@ restart: } out: local_bh_enable(); - if (last) { + if (last_id) { /* nf ct hash resize happened, now clear the leftover. */ - if ((struct nf_conn *)cb->args[1] == last) + if (cb->args[1] == last_id) cb->args[1] = 0; - - nf_ct_put(last); } while (i) { -- cgit v1.2.3 From 1492e3dcb2be3aa46d1963da96aa9593e4e4db5a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Aug 2025 17:25:09 +0200 Subject: netfilter: ctnetlink: remove refcounting in expectation dumpers Same pattern as previous patch: do not keep the expectation object alive via refcount, only store a cookie value and then use that as the skip hint for dump resumption. AFAICS this has the same issue as the one resolved in the conntrack dumper, when we do if (!refcount_inc_not_zero(&exp->use)) to increment the refcount, there is a chance that exp == last, which causes a double-increment of the refcount and subsequent memory leak. Fixes: cf6994c2b981 ("[NETFILTER]: nf_conntrack_netlink: sync expectation dumping with conntrack table dumping") Fixes: e844a928431f ("netfilter: ctnetlink: allow to dump expectation per master conntrack") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 41 +++++++++++++++--------------------- 1 file changed, 17 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index f403acd82437..50fd6809380f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3170,23 +3170,27 @@ errout: return 0; } #endif -static int ctnetlink_exp_done(struct netlink_callback *cb) + +static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp) { - if (cb->args[1]) - nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); - return 0; + unsigned long id = (unsigned long)exp; + + id += nf_ct_get_id(exp->master); + id += exp->class; + + return id ? 
id : 1; } static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], @@ -3198,7 +3202,7 @@ restart: continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3207,9 +3211,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3220,32 +3222,30 @@ restart: } out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } static int ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; if (cb->args[0]) return 0; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; + restart: hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3253,9 +3253,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3266,9 +3264,6 @@ restart: cb->args[0] = 1; out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } @@ -3287,7 +3282,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, - .done = ctnetlink_exp_done, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, @@ -3337,7 +3331,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb, else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, - .done = ctnetlink_exp_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } -- cgit v1.2.3 From c8a7c2c608180f3b4e51dc958b3861242dcdd76d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Aug 2025 12:10:41 +0200 Subject: netfilter: nft_set_pipapo: don't return bogus extension pointer Dan Carpenter says: Commit 17a20e09f086 ("netfilter: nft_set: remove one argument from lookup and update functions") [..] leads to the following Smatch static checker warning: net/netfilter/nft_set_pipapo_avx2.c:1269 nft_pipapo_avx2_lookup() error: uninitialized symbol 'ext'. Fix this by initializing ext to NULL and setting it only once we've found a match.
Fixes: 17a20e09f086 ("netfilter: nft_set: remove one argument from lookup and update functions") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/netfilter-devel/aJBzc3V5wk-yPOnH@stanley.mountain/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo_avx2.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index db5d367e43c4..2f090e253caf 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1150,12 +1150,12 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; - const struct nft_set_ext *ext; unsigned long *res, *fill; bool map_index; int i; @@ -1246,13 +1246,13 @@ next_match: goto out; if (last) { - ext = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(ext) || - !nft_set_elem_active(ext, genmask))) { - ext = NULL; + const struct nft_set_ext *e = &f->mt[ret].e->ext; + + if (unlikely(nft_set_elem_expired(e) || + !nft_set_elem_active(e, genmask))) goto next_match; - } + ext = e; goto out; } -- cgit v1.2.3 From f54186df806fb1e9cb262d553f4ff942f9467cf1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 4 Aug 2025 13:35:15 +0300 Subject: netfilter: conntrack: clean up returns in nf_conntrack_log_invalid_sysctl() Smatch complains that these look like error paths with missing error codes, especially the one where we return if nf_log_is_registered() is true: net/netfilter/nf_conntrack_standalone.c:575 nf_conntrack_log_invalid_sysctl() warn: missing error code? 'ret' In fact, all these return zero deliberately. Change them to return a literal instead which helps readability as well as silencing the warning. Fixes: e89a68046687 ("netfilter: load nf_log_syslog on enabling nf_conntrack_log_invalid") Signed-off-by: Dan Carpenter Acked-by: Lance Yang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_standalone.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b8b10a85233..1f14ef0436c6 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -567,16 +567,16 @@ nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, return ret; if (*(u8 *)table->data == 0) - return ret; + return 0; /* Load nf_log_syslog only if no logger is currently registered */ for (i = 0; i < NFPROTO_NUMPROTO; i++) { if (nf_log_is_registered(i)) - return ret; + return 0; } request_module("%s", "nf_log_syslog"); - return ret; + return 0; } static struct ctl_table_header *nf_ct_netfilter_header; -- cgit v1.2.3 From 1dee968d22eaeb3eede70df513ab3f8dd1712e3e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 7 Aug 2025 12:02:42 +0200 Subject: netfilter: nft_socket: remove WARN_ON_ONCE with huge level value syzbot managed to reach this WARN_ON_ONCE by passing a huge level value, remove it. 
WARNING: CPU: 0 PID: 5853 at net/netfilter/nft_socket.c:220 nft_socket_init+0x2f4/0x3d0 net/netfilter/nft_socket.c:220 Reported-by: syzbot+a225fea35d7baf8dbdc3@syzkaller.appspotmail.com Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 35d0409b0095..36affbb697c2 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -217,7 +217,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, level += err; /* Implies a giant cgroup tree */ - if (WARN_ON_ONCE(level > 255)) + if (level > 255) return -EOPNOTSUPP; priv->level = level; -- cgit v1.2.3 From fa479132845e94b60068fad01c2a9979b3efe2dc Mon Sep 17 00:00:00 2001 From: Li Jun Date: Wed, 30 Jul 2025 18:50:19 +0800 Subject: bpf: Standardize function declaration style 'noinlne' after 'int' cause "ERROR: inline keyword should sit between storage class and type" by checkpatch.pl - Standardize function declaration style by moving 'noinline' modifier - Fix asm volatile statement formatting Signed-off-by: Li Jun Link: https://lore.kernel.org/r/20250730105019.436235-1-lijun01@kylinos.cn Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9728dbd4c66c..4a862d605386 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -524,27 +524,27 @@ __bpf_kfunc int bpf_fentry_test1(int a) } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -int noinline bpf_fentry_test2(int a, u64 b) +noinline int bpf_fentry_test2(int a, u64 b) { return a + b; } -int noinline bpf_fentry_test3(char a, int b, u64 c) +noinline int bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } -int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +noinline int bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } -int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } -int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } @@ -553,13 +553,13 @@ struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; -int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg) { - asm volatile ("": "+r"(arg)); + asm volatile ("" : "+r"(arg)); return (long)arg; } -int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } @@ -569,12 +569,12 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } -int noinline bpf_fentry_test10(const void *a) +noinline int bpf_fentry_test10(const void *a) { return (long)a; } -void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +noinline void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } @@ -598,7 +598,7 @@ __bpf_kfunc int bpf_modify_return_test_tp(int nonce) return nonce; } -int noinline bpf_fentry_shadow_test(int a) +noinline int bpf_fentry_shadow_test(int a) { return a + 1; } -- cgit v1.2.3 From 64fdaa94bfe0cca3a0f4b2dd922486c5f59fe678 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Aug 2025 17:36:54 -0700 Subject: net: page_pool: 
allow enabling recycling late, fix false positive warning Page pool can have pages "directly" (locklessly) recycled to it, if the NAPI that owns the page pool is scheduled to run on the same CPU. To make this safe we check that the NAPI is disabled while we destroy the page pool. In most cases NAPI and page pool lifetimes are tied together so this happens naturally. The queue API expects the following order of calls: -> mem_alloc alloc new pp -> stop napi_disable -> start napi_enable -> mem_free free old pp Here we allocate the page pool in ->mem_alloc and free in ->mem_free. But the NAPIs are only stopped between ->stop and ->start. We created page_pool_disable_direct_recycling() to safely shut down the recycling in ->stop. This way the page_pool_destroy() call in ->mem_free doesn't have to worry about recycling any more. Unfortunately, the page_pool_disable_direct_recycling() is not enough to deal with failures which necessitate freeing the _new_ page pool. If we hit a failure in ->mem_alloc or ->stop the new page pool has to be freed while the NAPI is active (assuming driver attaches the page pool to an existing NAPI instance and doesn't reallocate NAPIs). Freeing the new page pool is technically safe because it hasn't been used for any packets, yet, so there can be no recycling. But the check in napi_assert_will_not_race() has no way of knowing that. We could check if page pool is empty but that'd make the check much less likely to trigger during development. Add page_pool_enable_direct_recycling(), pairing with page_pool_disable_direct_recycling(). It will allow us to create the new page pools in "disabled" state and only enable recycling when we know the reconfig operation will not fail. Coincidentally it will also let us re-enable the recycling for the old pool, if the reconfig failed: -> mem_alloc (new) -> stop (old) # disables direct recycling for old -> start (new) # fail!! -> start (old) # go back to old pp but direct recycling is lost :( -> mem_free (new) The new helper is idempotent to make the life easier for drivers, which can operate in HDS mode and support zero-copy Rx. The driver can call the helper twice whether there are two pools or it has multiple references to a single pool. 
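To make the intended call ordering concrete, here is a rough sketch of driver-side queue ops using the pair of helpers; the surrounding structure and the 'rxr'/'old_pool'/'new_pool' names are hypothetical, only the page_pool_*() calls are from this patch:

    /* ->mem_alloc: create the new pool with pp.napi left unset,
     * i.e. with direct recycling disabled */
    new_pool = page_pool_create(&pp);

    /* ->stop: NAPI is about to be disabled; cut off direct
     * recycling for the old pool */
    page_pool_disable_direct_recycling(old_pool);

    /* ->start: past the last failure point, hand the new pool
     * over to the NAPI instance */
    page_pool_enable_direct_recycling(new_pool, &rxr->napi);

    /* ->mem_free: now safe to destroy the old pool */
    page_pool_destroy(old_pool);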
Fixes: 40eca00ae605 ("bnxt_en: unlink page pool when stopping Rx queue") Tested-by: David Wei Link: https://patch.msgid.link/20250805003654.2944974-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++++++++- include/net/page_pool/types.h | 2 ++ net/core/page_pool.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 5578ddcb465d..76a4c5ae8000 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3819,7 +3819,6 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, if (BNXT_RX_PAGE_MODE(bp)) pp.pool_size += bp->rx_ring_size / rx_size_fac; pp.nid = numa_node; - pp.napi = &rxr->bnapi->napi; pp.netdev = bp->dev; pp.dev = &bp->pdev->dev; pp.dma_dir = bp->rx_dir; @@ -3851,6 +3850,12 @@ err_destroy_pp: return PTR_ERR(pool); } +static void bnxt_enable_rx_page_pool(struct bnxt_rx_ring_info *rxr) +{ + page_pool_enable_direct_recycling(rxr->head_pool, &rxr->bnapi->napi); + page_pool_enable_direct_recycling(rxr->page_pool, &rxr->bnapi->napi); +} + static int bnxt_alloc_rx_agg_bmap(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) { u16 mem_size; @@ -3889,6 +3894,7 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) rc = bnxt_alloc_rx_page_pool(bp, rxr, cpu_node); if (rc) return rc; + bnxt_enable_rx_page_pool(rxr); rc = xdp_rxq_info_reg(&rxr->xdp_rxq, bp->dev, i, 0); if (rc < 0) @@ -16031,6 +16037,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) goto err_reset; } + bnxt_enable_rx_page_pool(rxr); napi_enable_locked(&bnapi->napi); bnxt_db_nq_arm(bp, &cpr->cp_db, cpr->cp_raw_cons); diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 431b593de709..1509a536cb85 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -265,6 +265,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, struct xdp_mem_info; #ifdef CONFIG_PAGE_POOL +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi); void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 05e2e22a8f7c..343a6cac21e3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -1201,6 +1201,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), pool->xdp_mem_id = mem->id; } +/** + * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI + * @pool: page pool to modify + * @napi: NAPI instance to associate the page pool with + * + * Associate a page pool with a NAPI instance for lockless page recycling. + * This is useful when a new page pool has to be added to a NAPI instance + * without disabling that NAPI instance, to mark the point at which control + * path "hands over" the page pool to the NAPI instance. In most cases driver + * can simply set the @napi field in struct page_pool_params, and does not + * have to call this helper. + * + * The function is idempotent, but does not implement any refcounting. + * Single page_pool_disable_direct_recycling() will disable recycling, + * no matter how many times enable was called. 
+ */ +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi) +{ + if (READ_ONCE(pool->p.napi) == napi) + return; + WARN_ON(!napi || pool->p.napi); + + mutex_lock(&page_pools_lock); + WRITE_ONCE(pool->p.napi, napi); + mutex_unlock(&page_pools_lock); +} +EXPORT_SYMBOL(page_pool_enable_direct_recycling); + void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. -- cgit v1.2.3 From aba0c94f61ec05315fa7815d21aefa4c87f6a9f4 Mon Sep 17 00:00:00 2001 From: Budimir Markovic Date: Thu, 7 Aug 2025 04:18:11 +0000 Subject: vsock: Do not allow binding to VMADDR_PORT_ANY It is possible for a vsock to autobind to VMADDR_PORT_ANY. This can cause a use-after-free when a connection is made to the bound socket. The socket returned by accept() also has port VMADDR_PORT_ANY but is not on the list of unbound sockets. Binding it will result in an extra refcount decrement similar to the one fixed in fcdd2242c023 (vsock: Keep the binding until socket destruction). Modify the check in __vsock_bind_connectible() to also prevent binding to VMADDR_PORT_ANY. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reported-by: Budimir Markovic Signed-off-by: Budimir Markovic Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20250807041811.678-1-markovicbudimir@gmail.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ead6a3c14b87..bebb355f3ffe 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -689,7 +689,8 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { - if (port <= LAST_RESERVED_PORT) + if (port == VMADDR_PORT_ANY || + port <= LAST_RESERVED_PORT) port = LAST_RESERVED_PORT + 1; new_addr.svm_port = port++; -- cgit v1.2.3 From fd60d8a086191fe33c2d719732d2482052fa6805 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 7 Aug 2025 15:40:11 -0400 Subject: sctp: linearize cloned gso packets in sctp_rcv A cloned head skb still shares these frag skbs in fraglist with the original head skb. It's not safe to access these frag skbs. 
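As a minimal sketch of why the sharing happens (illustrative, not code from the patch):

    /* skb_clone() duplicates only the sk_buff head; the clone and
     * the original point at the same data buffer and thus the same
     * skb_shared_info, so the GSO frag_list chain is shared and
     * must not be touched through either copy */
    struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

Hence the fix below forces a private linear copy for cloned GSO skbs before SCTP reads the chunk data.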
syzbot reported two use-of-uninitialized-memory bugs caused by this: BUG: KMSAN: uninit-value in sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 sctp_inq_pop+0x15b7/0x1920 net/sctp/inqueue.c:211 sctp_assoc_bh_rcv+0x1a7/0xc50 net/sctp/associola.c:998 sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 sctp_backlog_rcv+0x397/0xdb0 net/sctp/input.c:331 sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1122 __release_sock+0x1da/0x330 net/core/sock.c:3106 release_sock+0x6b/0x250 net/core/sock.c:3660 sctp_wait_for_connect+0x487/0x820 net/sctp/socket.c:9360 sctp_sendmsg_to_asoc+0x1ec1/0x1f00 net/sctp/socket.c:1885 sctp_sendmsg+0x32b9/0x4a80 net/sctp/socket.c:2031 inet_sendmsg+0x25a/0x280 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:718 [inline] and BUG: KMSAN: uninit-value in sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 sctp_assoc_bh_rcv+0x34e/0xbc0 net/sctp/associola.c:987 sctp_inq_push+0x2a3/0x350 net/sctp/inqueue.c:88 sctp_backlog_rcv+0x3c7/0xda0 net/sctp/input.c:331 sk_backlog_rcv+0x142/0x420 include/net/sock.h:1148 __release_sock+0x1d3/0x330 net/core/sock.c:3213 release_sock+0x6b/0x270 net/core/sock.c:3767 sctp_wait_for_connect+0x458/0x820 net/sctp/socket.c:9367 sctp_sendmsg_to_asoc+0x223a/0x2260 net/sctp/socket.c:1886 sctp_sendmsg+0x3910/0x49f0 net/sctp/socket.c:2032 inet_sendmsg+0x269/0x2a0 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:712 [inline] This patch fixes it by linearizing cloned gso packets in sctp_rcv(). Fixes: 90017accff61 ("sctp: Add GSO support") Reported-by: syzbot+773e51afe420baaf0e2b@syzkaller.appspotmail.com Reported-by: syzbot+70a42f45e76bede082be@syzkaller.appspotmail.com Signed-off-by: Xin Long Reviewed-by: Marcelo Ricardo Leitner Link: https://patch.msgid.link/dd7dc337b99876d4132d0961f776913719f7d225.1754595611.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 2dc2666988fb..7e99894778d4 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb) * it's better to just linearize it otherwise crc computing * takes longer. */ - if ((!is_gso && skb_linearize(skb)) || + if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; -- cgit v1.2.3 From ccba9f6baa900e31ad1a4c36e6f3c176694f9eac Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Aug 2025 17:12:04 -0700 Subject: net: update NAPI threaded config even for disabled NAPIs We have to make sure that all future NAPIs will have the right threaded state when the state is configured on the device level. We chose not to have an "unset" state for threaded, and not to wipe the NAPI config clean when channels are explicitly disabled. This means the persistent config structs "exist" even when their NAPIs are not instantiated. Differently put - the NAPI persistent state lives in the net_device (ncfg == struct napi_config): ,--- [napi 0] - [napi 1] [dev] | | `--- [ncfg 0] - [ncfg 1] so say we have a device with 2 queues but only 1 enabled: ,--- [napi 0] [dev] | `--- [ncfg 0] - [ncfg 1] now we set the device to threaded=1: ,---------- [napi 0 (thr:1)] [dev(thr:1)] | `---------- [ncfg 0 (thr:1)] - [ncfg 1 (thr:?)] Since [ncfg 1] was not attached to a NAPI during configuration we skipped it. If we create a NAPI for it later it will have the old setting (presumably disabled).
One could argue if this is right or not "in principle", but it's definitely not how things worked before per-NAPI config. Fixes: 2677010e7793 ("Add support to set NAPI threaded for individual NAPI") Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250809001205.1147153-3-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 5 ++++- net/core/dev.c | 7 ++++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5e5de4b0a433..f3a3b761abfb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2071,6 +2071,8 @@ enum netdev_reg_state { * @max_pacing_offload_horizon: max EDT offload horizon in nsec. * @napi_config: An array of napi_config structures containing per-NAPI * settings. + * @num_napi_configs: number of allocated NAPI config structs, + * always >= max(num_rx_queues, num_tx_queues). * @gro_flush_timeout: timeout for GRO layer in NAPI * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. @@ -2482,8 +2484,9 @@ struct net_device { u64 max_pacing_offload_horizon; struct napi_config *napi_config; - unsigned long gro_flush_timeout; + u32 num_napi_configs; u32 napi_defer_hard_irqs; + unsigned long gro_flush_timeout; /** * @up: copy of @state's IFF_UP, but safe to read with just @lock. diff --git a/net/core/dev.c b/net/core/dev.c index 68dc47d7e700..f180746382a1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6999,7 +6999,7 @@ int netif_set_threaded(struct net_device *dev, enum netdev_napi_threaded threaded) { struct napi_struct *napi; - int err = 0; + int i, err = 0; netdev_assert_locked_or_invisible(dev); @@ -7021,6 +7021,10 @@ int netif_set_threaded(struct net_device *dev, list_for_each_entry(napi, &dev->napi_list, dev_list) WARN_ON_ONCE(napi_set_threaded(napi, threaded)); + /* Override the config for all NAPIs even if currently not listed */ + for (i = 0; i < dev->num_napi_configs; i++) + dev->napi_config[i].threaded = threaded; + return err; } @@ -11873,6 +11877,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, goto free_all; dev->cfg_pending = dev->cfg; + dev->num_napi_configs = maxqs; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) -- cgit v1.2.3 From b3fc08ab9a565efb42fe08be046a0d203b82cdb8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Aug 2025 17:12:05 -0700 Subject: net: prevent deadlocks when enabling NAPIs with mixed kthread config The following order of calls currently deadlocks if: - device has threaded=1; and - NAPI has persistent config with threaded=0. netif_napi_add_weight_config() dev->threaded == 1 napi_kthread_create() napi_enable() napi_restore_config() napi_set_threaded(0) napi_stop_kthread() while (NAPIF_STATE_SCHED) msleep(20) We deadlock because a disabled NAPI has STATE_SCHED set. Creating a thread in netif_napi_add() just to destroy it in napi_disable() is fairly ugly in the first place. Let's read both the device config and the NAPI config in netif_napi_add().
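The resulting precedence rule, distilled (this mirrors the napi_get_threaded_config() helper added to net/core/dev.h below):

    /* the per-NAPI persistent config, when present, wins over the
     * device-wide default when deciding whether to spawn a kthread */
    threaded = napi->config ? napi->config->threaded : dev->threaded;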
Fixes: e6d76268813d ("net: Update threaded state in napi config in netif_set_threaded") Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250809001205.1147153-4-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/dev.c | 5 +++-- net/core/dev.h | 8 ++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f180746382a1..5a3c0f40a93f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7357,8 +7357,9 @@ void netif_napi_add_weight_locked(struct net_device *dev, * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). */ - if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = NETDEV_NAPI_THREADED_DISABLED; + if (napi_get_threaded_config(dev, napi)) + if (napi_kthread_create(napi)) + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); diff --git a/net/core/dev.h b/net/core/dev.h index ab69edc0c3e3..d6b08d435479 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -323,6 +323,14 @@ static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) return NETDEV_NAPI_THREADED_DISABLED; } +static inline enum netdev_napi_threaded +napi_get_threaded_config(struct net_device *dev, struct napi_struct *n) +{ + if (n->config) + return n->config->threaded; + return dev->threaded; +} + int napi_set_threaded(struct napi_struct *n, enum netdev_napi_threaded threaded); -- cgit v1.2.3 From c5ec7f49b480db0dfc83f395755b1c2a7c979920 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Fri, 4 Jul 2025 13:17:47 +0200 Subject: devlink: let driver opt out of automatic phys_port_name generation Currently when adding devlink port, phys_port_name is automatically generated within devlink port initialization flow. As a result adding devlink port support to driver may result in forced changes of interface names, which breaks already existing network configs. This is an expected behavior but in some scenarios it would not be preferable to provide such limitation for legacy driver not being able to keep 'pre-devlink' interface name. Add flag no_phys_port_name to devlink_port_attrs struct which indicates if devlink should not alter name of interface. Suggested-by: Jiri Pirko Link: https://lore.kernel.org/all/nbwrfnjhvrcduqzjl4a2jafnvvud6qsbxlvxaxilnryglf4j7r@btuqrimnfuly/ Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- include/net/devlink.h | 6 +++++- net/devlink/port.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 93640a29427c..b32c9ceeb81d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -78,6 +78,9 @@ struct devlink_port_pci_sf_attrs { * @flavour: flavour of the port * @split: indicates if this is split port * @splittable: indicates if the port can be split. + * @no_phys_port_name: skip automatic phys_port_name generation; for + * compatibility only, newly added driver/port instance + * should never set this. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. 
* @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL * @phys: physical port attributes @@ -87,7 +90,8 @@ struct devlink_port_pci_sf_attrs { */ struct devlink_port_attrs { u8 split:1, - splittable:1; + splittable:1, + no_phys_port_name:1; u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; diff --git a/net/devlink/port.c b/net/devlink/port.c index 939081a0e615..cb8d4df61619 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1519,7 +1519,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; - if (!devlink_port->attrs_set) + if (!devlink_port->attrs_set || devlink_port->attrs.no_phys_port_name) return -EOPNOTSUPP; switch (attrs->flavour) { -- cgit v1.2.3 From 526c2530cbf84428a0a2b5ca7800986c0912ac35 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 10 Aug 2025 15:29:40 +0800 Subject: tcp: cdg: remove redundant __GFP_NOWARN GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant __GFP_NOWARN. Signed-off-by: Qianfeng Rong Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250810072944.438574-2-rongqianfeng@vivo.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_cdg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ba4d98e510e0..fbad6c35dee9 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -379,7 +379,7 @@ static void tcp_cdg_init(struct sock *sk) /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); } -- cgit v1.2.3 From 7792232a4ea1a8b6fe34d0d99a1e02aac185e633 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 10 Aug 2025 15:29:41 +0800 Subject: RDS: remove redundant __GFP_NOWARN GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant __GFP_NOWARN. Signed-off-by: Qianfeng Rong Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250810072944.438574-3-rongqianfeng@vivo.com Signed-off-by: Jakub Kicinski --- net/rds/ib_recv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e53b7f266bd7..4248dfa816eb 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1034,7 +1034,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) { - rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); rds_ib_stats_inc(s_ib_rx_refill_from_cq); } } -- cgit v1.2.3 From 63fe077c21d323cbc8d56114e9139bbadab33ce5 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 11 Aug 2025 11:34:40 +0200 Subject: caif: Replace memset(0) + strscpy() with strscpy_pad() Replace memset(0) followed by strscpy() with strscpy_pad() to improve cfctrl_linkup_request(). This avoids zeroing the memory before copying the string and ensures the destination buffer is only written to once, simplifying the code and improving efficiency. 
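For illustration, the shape of the transformation (a sketch with a hypothetical source string 'name', not the patch itself):

    char buf[UTILITY_NAME_LENGTH];

    /* before: two writes over the buffer */
    memset(buf, 0, sizeof(buf));
    strscpy(buf, name, sizeof(buf));

    /* after: one call copies the string and zero-fills the tail */
    strscpy_pad(buf, name);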
Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250811093442.5075-2-thorsten.blum@linux.dev Signed-off-by: Jakub Kicinski --- net/caif/cfctrl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 06b604cf9d58..2aa1e7d46eb2 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -257,9 +257,7 @@ int cfctrl_linkup_request(struct cflayer *layer, cfpkt_add_body(pkt, &tmp16, 2); tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); - memset(utility_name, 0, sizeof(utility_name)); - strscpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH); + strscpy_pad(utility_name, param->u.utility.name); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); -- cgit v1.2.3 From 86e3d52bd3e919181d5f7e5107065d16e694c8d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Aug 2025 14:52:52 +0000 Subject: phonet: add __rcu annotations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removes following sparse errors. make C=2 net/phonet/socket.o net/phonet/af_phonet.o CHECK net/phonet/socket.c net/phonet/socket.c:619:14: error: incompatible types in comparison expression (different address spaces): net/phonet/socket.c:619:14: struct sock [noderef] __rcu * net/phonet/socket.c:619:14: struct sock * net/phonet/socket.c:642:17: error: incompatible types in comparison expression (different address spaces): net/phonet/socket.c:642:17: struct sock [noderef] __rcu * net/phonet/socket.c:642:17: struct sock * net/phonet/socket.c:658:17: error: incompatible types in comparison expression (different address spaces): net/phonet/socket.c:658:17: struct sock [noderef] __rcu * net/phonet/socket.c:658:17: struct sock * net/phonet/socket.c:677:25: error: incompatible types in comparison expression (different address spaces): net/phonet/socket.c:677:25: struct sock [noderef] __rcu * net/phonet/socket.c:677:25: struct sock * net/phonet/socket.c:726:21: warning: context imbalance in 'pn_res_seq_start' - wrong count at exit net/phonet/socket.c:741:13: warning: context imbalance in 'pn_res_seq_stop' - wrong count at exit CHECK net/phonet/af_phonet.c net/phonet/af_phonet.c:35:14: error: incompatible types in comparison expression (different address spaces): net/phonet/af_phonet.c:35:14: struct phonet_protocol const [noderef] __rcu * net/phonet/af_phonet.c:35:14: struct phonet_protocol const * net/phonet/af_phonet.c:474:17: error: incompatible types in comparison expression (different address spaces): net/phonet/af_phonet.c:474:17: struct phonet_protocol const [noderef] __rcu * net/phonet/af_phonet.c:474:17: struct phonet_protocol const * net/phonet/af_phonet.c:486:9: error: incompatible types in comparison expression (different address spaces): net/phonet/af_phonet.c:486:9: struct phonet_protocol const [noderef] __rcu * net/phonet/af_phonet.c:486:9: struct phonet_protocol const * Signed-off-by: Eric Dumazet Acked-by: Rémi Denis-Courmont Link: https://patch.msgid.link/20250811145252.1007242-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/phonet/af_phonet.c | 4 ++-- net/phonet/socket.c | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index a27efa4faa4e..238a9638d2b0 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -22,7 +22,7 @@ #include /* Transport 
protocol registration */ -static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol __rcu *proto_tab[PHONET_NPROTO] __read_mostly; static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { @@ -482,7 +482,7 @@ void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); - BUG_ON(proto_tab[protocol] != pp); + BUG_ON(rcu_access_pointer(proto_tab[protocol]) != pp); RCU_INIT_POINTER(proto_tab[protocol], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); diff --git a/net/phonet/socket.c b/net/phonet/socket.c index ea4d5e6533db..2b61a40b568e 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -602,7 +602,7 @@ const struct seq_operations pn_sock_seq_ops = { #endif static struct { - struct sock *sk[256]; + struct sock __rcu *sk[256]; } pnres; /* @@ -654,7 +654,7 @@ int pn_sock_unbind_res(struct sock *sk, u8 res) return -EPERM; mutex_lock(&resource_mutex); - if (pnres.sk[res] == sk) { + if (rcu_access_pointer(pnres.sk[res]) == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); ret = 0; } @@ -673,7 +673,7 @@ void pn_sock_unbind_all_res(struct sock *sk) mutex_lock(&resource_mutex); for (res = 0; res < 256; res++) { - if (pnres.sk[res] == sk) { + if (rcu_access_pointer(pnres.sk[res]) == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); match++; } @@ -688,7 +688,7 @@ void pn_sock_unbind_all_res(struct sock *sk) } #ifdef CONFIG_PROC_FS -static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) +static struct sock __rcu **pn_res_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); unsigned int i; @@ -697,7 +697,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) return NULL; for (i = 0; i < 256; i++) { - if (pnres.sk[i] == NULL) + if (rcu_access_pointer(pnres.sk[i]) == NULL) continue; if (!pos) return pnres.sk + i; @@ -706,7 +706,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) return NULL; } -static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk) +static struct sock __rcu **pn_res_get_next(struct seq_file *seq, struct sock __rcu **sk) { struct net *net = seq_file_net(seq); unsigned int i; @@ -728,7 +728,7 @@ static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct sock **sk; + struct sock __rcu **sk; if (v == SEQ_START_TOKEN) sk = pn_res_get_idx(seq, 0); @@ -747,11 +747,12 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v) static int pn_res_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 63); - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { seq_puts(seq, "rs uid inode"); - else { - struct sock **psk = v; - struct sock *sk = *psk; + } else { + struct sock __rcu **psk = v; + struct sock *sk = rcu_dereference_protected(*psk, + lockdep_is_held(&resource_mutex)); seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), -- cgit v1.2.3 From b3ba7d929ce197ff2651046798b94bd62eb0e680 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 11 Aug 2025 18:40:38 +0200 Subject: net/sched: Remove redundant memset(0) call in reset_policy() The call to nla_strscpy() already zero-pads the tail of the destination buffer which makes the additional memset(0) call redundant. Remove it. 
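Sketching why this is safe (based on nla_strscpy()'s documented behavior of zero-padding the destination):

    /* copies at most SIMP_MAX_DATA - 1 bytes, NUL-terminates, and
     * memset()s the remaining tail of the buffer to zero, so a
     * preceding memset(..., 0, SIMP_MAX_DATA) is pure overhead */
    nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);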
Signed-off-by: Thorsten Blum Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250811164039.43250-1-thorsten.blum@linux.dev Signed-off-by: Jakub Kicinski --- net/sched/act_simple.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index f3abe0545989..8e69a919b4fe 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -72,7 +72,6 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); - memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) -- cgit v1.2.3 From 6db015fc4b5d5f63a64a193f65d98da3a7fc811d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Aug 2025 16:29:06 -0700 Subject: tls: handle data disappearing from under the TLS ULP TLS expects that it owns the receive queue of the TCP socket. This cannot be guaranteed in case the reader of the TCP socket entered before the TLS ULP was installed, or uses some non-standard read API (eg. zerocopy ones). Replace the WARN_ON() and a buggy early exit (which leaves anchor pointing to a freed skb) with real error handling. Wipe the parsing state and tell the reader to retry. We already reload the anchor every time we (re)acquire the socket lock, so the only condition we need to avoid is an out of bounds read (not having enough bytes in the socket for previously parsed record len). If some data was read from under TLS but there's enough in the queue we'll reload and decrypt what is most likely not a valid TLS record. Leading to some undefined behavior from TLS perspective (corrupting a stream? missing an alert? missing an attack?) but no kernel crash should take place. 
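Distilled, the new error handling looks like this (the complete version is in the diff below):

    /* tls_strp_msg_load(): the previously parsed record is no longer
     * fully in the TCP receive queue - wipe the parser state and
     * report failure instead of reloading a stale anchor */
    if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) {
        WRITE_ONCE(strp->msg_ready, 0);
        memset(&strp->stm, 0, sizeof(strp->stm));
        return false;
    }

On failure the caller, tls_rx_rec_wait(), simply restarts its wait/parse cycle.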
Reported-by: William Liu Reported-by: Savino Dicanosa Link: https://lore.kernel.org/tFjq_kf7sWIG3A7CrCg_egb8CVsT_gsmHAK0_wxDPJXfIzxFAMxqmLwp3MlU5EHiet0AwwJldaaFdgyHpeIUCS-3m3llsmRzp9xIOBR4lAI=@syst3mfailure.io Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250807232907.600366-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls.h | 2 +- net/tls/tls_strp.c | 11 ++++++++--- net/tls/tls_sw.c | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index 774859b63f0d..4e077068e6d9 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -196,7 +196,7 @@ void tls_strp_msg_done(struct tls_strparser *strp); int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); void tls_rx_msg_ready(struct tls_strparser *strp); -void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); +bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 095cf31bae0b..d71643b494a1 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -475,7 +475,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) strp->stm.offset = offset; } -void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) +bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) { struct strp_msg *rxm; struct tls_msg *tlm; @@ -484,8 +484,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); if (!strp->copy_mode && force_refresh) { - if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) - return; + if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) { + WRITE_ONCE(strp->msg_ready, 0); + memset(&strp->stm, 0, sizeof(strp->stm)); + return false; + } tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); } @@ -495,6 +498,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) rxm->offset = strp->stm.offset; tlm = tls_msg(strp->anchor); tlm->control = strp->mark; + + return true; } /* Called with lock held on lower socket */ diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 549d1ea01a72..51c98a007dda 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1384,7 +1384,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, return sock_intr_errno(timeo); } - tls_strp_msg_load(&ctx->strp, released); + if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) + return tls_rx_rec_wait(sk, psock, nonblock, false); return 1; } -- cgit v1.2.3 From 30c1d25b9870d551be42535067d5481668b5e6f3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 11 Aug 2025 12:26:10 +0200 Subject: netfilter: nft_set_pipapo: fix null deref for empty set Blamed commit broke the check for a null scratch map: - if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + if (unlikely(!raw_cpu_ptr(m->scratch))) This should have been "if (!*raw_ ...)". Use the pattern of the avx2 version which is more readable. This can only be reproduced if avx2 support isn't available. 
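Spelled out (a sketch; the actual hunk follows):

    /* raw_cpu_ptr() yields the address of this CPU's slot and is
     * never NULL; it is the pointer stored in the slot that can be
     * NULL when the set is empty */
    scratch = *raw_cpu_ptr(m->scratch);
    if (unlikely(!scratch))
        goto out;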
Fixes: d8d871a35ca9 ("netfilter: nft_set_pipapo: merge pipapo_get/lookup") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 1a19649c2851..9a10251228fd 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -426,10 +426,9 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, local_bh_disable(); - if (unlikely(!raw_cpu_ptr(m->scratch))) - goto out; - scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) + goto out; map_index = scratch->map_index; -- cgit v1.2.3 From c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Jul 2025 14:26:11 +0200 Subject: ipvs: Fix estimator kthreads preferred affinity The estimator kthreads' affinities are defined by sysctl-overwritten preferences and applied through a plain call to the scheduler's affinity API. However, since the introduction of managed kthreads preferred affinity, such a practice shortcuts the kthreads core code, which eventually overwrites the target to the default unbound affinity. Fix this by using the appropriate kthread API. Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node") Signed-off-by: Frederic Weisbecker Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/net/ip_vs.h | 13 +++++++++++++ kernel/kthread.c | 1 + net/netfilter/ipvs/ip_vs_est.c | 3 ++- 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff406ef4fd4a..29a36709e7f3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1163,6 +1163,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; @@ -1270,6 +1278,11 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; diff --git a/kernel/kthread.c b/kernel/kthread.c index 0e98b228a8ef..31b072e8d427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -893,6 +893,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index f821ad2e19b3..15049b826732 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -265,7 +265,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, } set_user_nice(kd->task, sysctl_est_nice(ipvs)); - set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); + if (sysctl_est_preferred_cpulist(ipvs)) + kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); -- cgit v1.2.3 From cf5fb87fcdaaaafec55dcc0dc5a9e15ead343973 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 13 Aug 2025 02:38:50 +0200
Subject: netfilter: nf_tables: reject duplicate device on updates A chain/flowtable update with duplicated devices in the same batch is possible. Unfortunately, netdev event path only removes the first device that is found, leaving unregistered the hook of the duplicated device. Check if a duplicated device exists in the transaction batch, bail out with EEXIST in such case. WARNING is hit when unregistering the hook: [49042.221275] WARNING: CPU: 4 PID: 8425 at net/netfilter/core.c:340 nf_hook_entry_head+0xaa/0x150 [49042.221375] CPU: 4 UID: 0 PID: 8425 Comm: nft Tainted: G S 6.16.0+ #170 PREEMPT(full) [...] [49042.221382] RIP: 0010:nf_hook_entry_head+0xaa/0x150 Fixes: 78d9f48f7f44 ("netfilter: nf_tables: add devices to existing flowtable") Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 13d0ed9d1895..58c5425d61c2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2803,6 +2803,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; struct nft_stats __percpu *stats = NULL; + struct nftables_pernet *nft_net; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -2845,6 +2846,20 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); nft_netdev_hook_free(h); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWCHAIN || + trans->table != ctx->table || + !nft_trans_chain_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) { + nft_chain_release_hook(&hook); + return -EEXIST; + } } } } else { @@ -9060,6 +9075,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; + struct nftables_pernet *nft_net; struct nft_hook *hook, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -9076,6 +9092,20 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); nft_netdev_hook_free(hook); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWFLOWTABLE || + trans->table != ctx->table || + !nft_trans_flowtable_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) { + err = -EEXIST; + goto err_flowtable_update_hook; + } } } -- cgit v1.2.3 From 4d18083d6b2c02e89d833c92c3fb79e2fe1e6795 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 12 Aug 2025 09:59:29 +0800 Subject: vsock: use sizeof(struct sockaddr_storage) instead of magic value Previous commit 230b183921ec ("net: Use standard structures for generic socket address structures.") use 'struct sockaddr_storage address;' to replace 'char address[MAX_SOCK_ADDR];'. The macro MAX_SOCK_ADDR is removed by commit 01893c82b4e6 ("net: Remove MAX_SOCK_ADDR constant"). 
The comment in vsock_getname() is outdated; use sizeof(struct sockaddr_storage) instead of the magic value 128. Signed-off-by: Wang Liang Link: https://patch.msgid.link/20250812015929.1419896-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ead6a3c14b87..f7b2d61d1d16 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1028,12 +1028,7 @@ static int vsock_getname(struct socket *sock, vm_addr = &vsk->local_addr; } - /* sys_getsockname() and sys_getpeername() pass us a - * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately - * that macro is defined in socket.c instead of .h, so we hardcode its - * value here. - */ - BUILD_BUG_ON(sizeof(*vm_addr) > 128); + BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage)); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); -- cgit v1.2.3 From a58893aa173923fdc49c2d35d638d8133065e952 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Tue, 12 Aug 2025 13:08:58 +0800 Subject: net: mctp: Fix bad kfree_skb in bind lookup test The kunit test's skb_pkt is consumed by mctp_dst_input() so shouldn't be freed separately. Fixes: e6d8e7dbc5a3 ("net: mctp: Add bind lookup test") Reported-by: Alexandre Ghiti Closes: https://lore.kernel.org/all/734b02a3-1941-49df-a0da-ec14310d41e4@ghiti.fr/ Signed-off-by: Matt Johnston Tested-by: Alexandre Ghiti Link: https://patch.msgid.link/20250812-fix-mctp-bind-test-v1-1-5e2128664eb3@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/route-test.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index fb6b46a952cb..69a3ccfc6310 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -1586,7 +1586,6 @@ static void mctp_test_bind_lookup(struct kunit *test) cleanup: kfree_skb(skb_sock); - kfree_skb(skb_pkt); /* Drop all binds */ for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) -- cgit v1.2.3 From 87c6efc5ce9c126ae4a781bc04504b83780e3650 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 12 Aug 2025 18:40:29 +0200 Subject: net/sched: ets: use old 'nbands' while purging unused classes Shuang reported sch_ets test-case [1] crashing in ets_class_qlen_notify() after recent changes from Lion [2]. The problem is: in ets_qdisc_change() we purge unused DWRR queues; the value of 'q->nbands' is the new one, and the cleanup should be done with the old one. The problem has been here since my first attempts to fix ets_qdisc_change(), but it surfaced again after the recent qdisc len accounting fixes. Fix it by purging idle DWRR queues before assigning a new value of 'q->nbands', so that all purge operations find a consistent configuration: - old 'q->nbands' because it's needed by ets_class_find() - old 'q->nstrict' because it's needed by ets_class_is_strict() BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 62 UID: 0 PID: 39457 Comm: tc Kdump: loaded Not tainted 6.12.0-116.el10.x86_64 #1 PREEMPT(voluntary) Hardware name: Dell Inc.
PowerEdge R640/06DKY5, BIOS 2.12.2 07/09/2021 RIP: 0010:__list_del_entry_valid_or_report+0x4/0x80 Code: ff 4c 39 c7 0f 84 39 19 8e ff b8 01 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa <48> 8b 17 48 8b 4f 08 48 85 d2 0f 84 56 19 8e ff 48 85 c9 0f 84 ab RSP: 0018:ffffba186009f400 EFLAGS: 00010202 RAX: 00000000000000d6 RBX: 0000000000000000 RCX: 0000000000000004 RDX: ffff9f0fa29b69c0 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffffffc12c2400 R08: 0000000000000008 R09: 0000000000000004 R10: ffffffffffffffff R11: 0000000000000004 R12: 0000000000000000 R13: ffff9f0f8cfe0000 R14: 0000000000100005 R15: 0000000000000000 FS: 00007f2154f37480(0000) GS:ffff9f269c1c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001530be001 CR4: 00000000007726f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ets_class_qlen_notify+0x65/0x90 [sch_ets] qdisc_tree_reduce_backlog+0x74/0x110 ets_qdisc_change+0x630/0xa40 [sch_ets] __tc_modify_qdisc.constprop.0+0x216/0x7f0 tc_modify_qdisc+0x7c/0x120 rtnetlink_rcv_msg+0x145/0x3f0 netlink_rcv_skb+0x53/0x100 netlink_unicast+0x245/0x390 netlink_sendmsg+0x21b/0x470 ____sys_sendmsg+0x39d/0x3d0 ___sys_sendmsg+0x9a/0xe0 __sys_sendmsg+0x7a/0xd0 do_syscall_64+0x7d/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f2155114084 Code: 89 02 b8 ff ff ff ff eb bb 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 80 3d 25 f0 0c 00 00 74 13 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 48 83 ec 28 89 54 24 1c 48 89 RSP: 002b:00007fff1fd7a988 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000560ec063e5e0 RCX: 00007f2155114084 RDX: 0000000000000000 RSI: 00007fff1fd7a9f0 RDI: 0000000000000003 RBP: 00007fff1fd7aa60 R08: 0000000000000010 R09: 000000000000003f R10: 0000560ee9b3a010 R11: 0000000000000202 R12: 00007fff1fd7aae0 R13: 000000006891ccde R14: 0000560ec063e5e0 R15: 00007fff1fd7aad0 [1] https://lore.kernel.org/netdev/e08c7f4a6882f260011909a868311c6e9b54f3e4.1639153474.git.dcaratti@redhat.com/ [2] https://lore.kernel.org/netdev/d912cbd7-193b-4269-9857-525bee8bbb6a@gmail.com/ Cc: stable@vger.kernel.org Fixes: 103406b38c60 ("net/sched: Always pass notifications when child class becomes empty") Fixes: c062f2a0b04d ("net/sched: sch_ets: don't remove idle classes from the round-robin list") Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Reported-by: Li Shuang Closes: https://issues.redhat.com/browse/RHEL-108026 Reviewed-by: Petr Machata Co-developed-by: Ivan Vecera Signed-off-by: Ivan Vecera Signed-off-by: Davide Caratti Link: https://patch.msgid.link/7928ff6d17db47a2ae7cc205c44777b1f1950545.1755016081.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_ets.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 037f764822b9..82635dd2cfa5 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -651,6 +651,12 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); + for (i = nbands; i < oldbands; i++) { + if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) + list_del_init(&q->classes[i].alist); + qdisc_purge_queue(q->classes[i].qdisc); + } + WRITE_ONCE(q->nbands, nbands); for (i = nstrict; i < q->nstrict; i++) { if (q->classes[i].qdisc->q.qlen) { @@ -658,11 +664,6 @@ 
static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, q->classes[i].deficit = quanta[i]; } } - for (i = q->nbands; i < oldbands; i++) { - if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) - list_del_init(&q->classes[i].alist); - qdisc_purge_queue(q->classes[i].qdisc); - } WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); -- cgit v1.2.3 From 52565a935213cd6a8662ddb8efe5b4219343a25d Mon Sep 17 00:00:00 2001 From: Sven Stegemann Date: Tue, 12 Aug 2025 21:18:03 +0200 Subject: net: kcm: Fix race condition in kcm_unattach() syzbot found a race condition when kcm_unattach(psock) and kcm_release(kcm) are executed at the same time. kcm_unattach() is missing a check of the flag kcm->tx_stopped before calling queue_work(). If the kcm has a reserved psock, kcm_unattach() might get executed between cancel_work_sync() and unreserve_psock() in kcm_release(), requeuing kcm->tx_work right before kcm gets freed in kcm_done(). Remove kcm->tx_stopped and replace it by the less error-prone disable_work_sync(). Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Reported-by: syzbot+e62c9db591c30e174662@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=e62c9db591c30e174662 Reported-by: syzbot+d199b52665b6c3069b94@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d199b52665b6c3069b94 Reported-by: syzbot+be6b1fdfeae512726b4e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=be6b1fdfeae512726b4e Signed-off-by: Sven Stegemann Link: https://patch.msgid.link/20250812191810.27777-1-sven@stegemann.de Signed-off-by: Jakub Kicinski --- include/net/kcm.h | 1 - net/kcm/kcmsock.c | 10 ++-------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/kcm.h b/include/net/kcm.h index 441e993be634..d9c35e71ecea 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -71,7 +71,6 @@ struct kcm_sock { struct list_head wait_psock_list; struct sk_buff *seq_skb; struct mutex tx_mutex; - u32 tx_stopped : 1; /* Don't use bit fields here, these are set under different locks */ bool tx_wait; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index a4971e6fa943..b4f01cb07561 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -430,7 +430,7 @@ static void psock_write_space(struct sock *sk) /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; - if (kcm && !unlikely(kcm->tx_stopped)) + if (kcm) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); @@ -1693,12 +1693,6 @@ static int kcm_release(struct socket *sock) */ __skb_queue_purge(&sk->sk_write_queue); - /* Set tx_stopped. This is checked when psock is bound to a kcm and we - * get a writespace callback. This prevents further work being queued - * from the callback (unbinding the psock occurs after canceling work. - */ - kcm->tx_stopped = 1; - release_sock(sk); spin_lock_bh(&mux->lock); @@ -1714,7 +1708,7 @@ static int kcm_release(struct socket *sock) /* Cancel work. After this point there should be no outside references * to the kcm socket. */ - cancel_work_sync(&kcm->tx_work); + disable_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; -- cgit v1.2.3 From f22cc6f766f84496b260347d4f0d92cf95f30699 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Aug 2025 16:42:09 -0700 Subject: net: ethtool: support including Flow Label in the flow hash for RSS Some modern NICs support including the IPv6 Flow Label in the flow hash for RSS queue selection. 
This is outside the old "Microsoft spec", but was included in the OCP NIC spec: [ ] RSS include flow label in the hash (configurable) https://www.opencompute.org/w/index.php?title=Core_Offloads#Receive_Side_Scaling RSS Flow Label hashing allows TCP Protective Load Balancing (PLB) to recover from receiver congestion / overload. Rx CPU/queue hotspots are relatively common for data ingest workloads, and so far we had to try to detect the condition at the RPC layer and reopen the connection. PLB lets us change the Flow Label and therefore Rx CPU on RTO, with minimal packet reordering. PLB reaction times are much faster, and can happen at any point in the connection, not just at RPC boundaries. Due to the nature of host processing (relatively long queues, other kernel subsystems masking IRQs for 100s of msecs) the risk of reordering within the host is higher than in the network. But for applications which need it - it is far preferable to potentially persistent overload of subset of queues. It is expected that the hash communicated to the host may change if the Flow Label changes. This may be surprising to some host software, but I don't expect the devices can compute two Toeplitz hashes, one with the Flow Label for queue selection and one without for the rx hash communicated to the host. Besides, changing the hash may potentially help to change the path thru host queues. User can disable NETIF_F_RXHASH if they require a stable flow hash. The name RXH_IP6_FL was chosen based on what we call Flow Label variables in IPv6 processing (fl). I prefer fl_lbl but that appears to be an fbnic-only spelling. We could spell out RXH_IP6_FLOW_LABEL but existing RXH_ defines are a lot more terse. Willem notes [1] that Flow Label is defined as identifying the flow and therefore including both the flow label _and_ the L4 header fields is not generally necessary. But it should not hurt so it's not explicitly prevented if the driver supports hashing on both at the same time. 
Link: https://lore.kernel.org/68483433b45e2_3cd66f29440@willemb.c.googlers.com.notmuch [1] Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250811234212.580748-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool.h | 1 + net/ethtool/ioctl.c | 25 +++++++++++++++++++++++++ net/ethtool/rss.c | 27 ++++++++++++++------------- 4 files changed, 43 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 1bc1bd7d33c2..7a7594713f1f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -204,6 +204,9 @@ definitions: doc: dst port in case of TCP/UDP/SCTP - name: gtp-teid + - + name: ip6-fl + doc: IPv6 Flow Label - name: discard value: 31 diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9e9afdd1238a..8bd5ea5469d9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2380,6 +2380,7 @@ enum { #define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ #define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ #define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_IP6_FL (1 << 9) /* IPv6 flow label */ #define RXH_DISCARD (1 << 31) #define RX_CLS_FLOW_DISC 0xffffffffffffffffULL diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 43a7854e784e..0b2a4d0573b3 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type) return false; } +static bool flow_type_v6(u32 flow_type) +{ + switch (flow_type) { + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + case GTPU_V6_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + /* When adding a new type, update the assert and, if it's hashable, add it to * the flow_type_hashable switch case. 
*/ @@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) if (rc) return rc; + if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type)) + return -EINVAL; + if (info.flow_type & FLOW_RSS && info.rss_context && !ops->rxfh_per_ctx_fields) return -EINVAL; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 992e98abe9dd..202d95e8bf3e 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) #define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ RXH_GTP_TEID | RXH_DISCARD) +#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) static const struct nla_policy ethnl_rss_flows_policy[] = { [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), }; const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { -- cgit v1.2.3 From 0ebc0bcd0aa0037019aac996c50166c7baf44ff8 Mon Sep 17 00:00:00 2001 From: Parav 
Pandit Date: Wed, 13 Aug 2025 12:44:16 +0300 Subject: devlink/port: Simplify return checks Drop always returning 0 from the helper routine and simplify its callers. Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20250813094417.7269-2-parav@nvidia.com Signed-off-by: Jakub Kicinski --- net/devlink/port.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/devlink/port.c b/net/devlink/port.c index cb8d4df61619..aaca1b23aa5f 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1333,8 +1333,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb, return NOTIFY_OK; } -static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour) +static void __devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; @@ -1347,7 +1347,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, } else { devlink_port->switch_port = false; } - return 0; } /** @@ -1359,14 +1358,10 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs) { - int ret; - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); devlink_port->attrs = *attrs; - ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, attrs->flavour); WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); @@ -1383,14 +1378,10 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; @@ -1411,14 +1402,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; @@ -1439,14 +1426,10 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_SF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; -- cgit v1.2.3 From 41a6e8ab18642741437da932c2f5762b185e928c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 13 Aug 2025 12:44:17 +0300 Subject: devlink/port: Check attributes early and constify Constify the devlink port attributes to indicate they are read-only and do not depend on anything else. Therefore, validate them early, before they are set in the devlink port.
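As a usage note, here is a hedged sketch of how a driver-side caller typically feeds attributes into this API; the wrapper function and its name are hypothetical, while the devlink calls and struct fields are the ones touched by these two patches. With the constified signature the attributes can even live in rodata.

	/* Hypothetical driver registration path using the constified API. */
	static int example_devlink_port_init(struct devlink *devlink,
					     struct devlink_port *dl_port,
					     u16 pf)
	{
		const struct devlink_port_attrs attrs = {
			.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL,
			.phys.port_number = pf,
		};

		devlink_port_attrs_set(dl_port, &attrs);

		return devlink_port_register(devlink, dl_port, pf);
	}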
Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20250813094417.7269-3-parav@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 +- net/devlink/port.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index b32c9ceeb81d..3119d053bc4d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1743,7 +1743,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *devlink_port_attrs); + const struct devlink_port_attrs *attrs); void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, diff --git a/net/devlink/port.c b/net/devlink/port.c index aaca1b23aa5f..93d8a25bb920 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1356,13 +1356,13 @@ static void __devlink_port_attrs_set(struct devlink_port *devlink_port, * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *attrs) + const struct devlink_port_attrs *attrs) { ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + WARN_ON(attrs->splittable && attrs->split); devlink_port->attrs = *attrs; __devlink_port_attrs_set(devlink_port, attrs->flavour); - WARN_ON(attrs->splittable && attrs->split); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); -- cgit v1.2.3 From eeea7688632e0c697f66bdd71708c8faf36f6540 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 13 Aug 2025 20:55:26 +0800 Subject: net/sched: Use TC_RTAB_SIZE instead of magic number Replace magic number with TC_RTAB_SIZE to make it more informative. Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20250813125526.853895-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d7c767b861a4..1e058b46d3e1 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -431,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && - !memcmp(&rtab->data, nla_data(tab), 1024)) { + !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) { rtab->refcnt++; return rtab; } @@ -441,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, nla_data(tab), 1024); + memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; -- cgit v1.2.3 From 3d05b24429e1de7a17c8fdccb04a04dbc8ad297b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 12 Aug 2025 11:02:12 +0300 Subject: bridge: Redirect to backup port when port is administratively down If a backup port is configured for a bridge port, the bridge will redirect known unicast traffic towards the backup port when the primary port is administratively up but without a carrier. This is useful, for example, in MLAG configurations where a system is connected to two switches and there is a peer link between both switches. 
The peer link serves as the backup port in case one of the switches loses its connection to the multi-homed system. In order to avoid flooding when the primary port loses its carrier, the bridge does not flush dynamic FDB entries pointing to the port upon STP disablement, if the port has a backup port. The above means that known unicast traffic destined to the primary port will be blackholed when the port is put administratively down, until the FDB entries pointing to it are aged out. Given that the current behavior is quite weird and unlikely to be depended on by anyone, amend the bridge to redirect to the backup port also when the primary port is administratively down and not only when it does not have a carrier. The change is motivated by a report from a user who expected traffic to be redirected to the backup port when the primary port was put administratively down while debugging a network issue. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250812080213.325298-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_forward.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 29097e984b4f..870bdf2e082c 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -148,7 +148,8 @@ void br_forward(const struct net_bridge_port *to, goto out; /* redirect to backup link if the destination port is down */ - if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) { + if (rcu_access_pointer(to->backup_port) && + (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) { struct net_bridge_port *backup_port; backup_port = rcu_dereference(to->backup_port); -- cgit v1.2.3 From d1547bf460baec718b3398365f8de33d25c5f36f Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Wed, 13 Aug 2025 10:10:54 +0800 Subject: net: bridge: fix soft lockup in br_multicast_query_expired() When setting multicast_query_interval to a large value, the local variable 'time' in br_multicast_send_query() may overflow. If the time is smaller than jiffies, the timer will expire immediately, and mod_timer() is then called again, which creates a loop and may trigger the following soft lockup issue. watchdog: BUG: soft lockup - CPU#1 stuck for 221s! [rb_consumer:66] CPU: 1 UID: 0 PID: 66 Comm: rb_consumer Not tainted 6.16.0+ #259 PREEMPT(none) Call Trace: __netdev_alloc_skb+0x2e/0x3a0 br_ip6_multicast_alloc_query+0x212/0x1b70 __br_multicast_send_query+0x376/0xac0 br_multicast_send_query+0x299/0x510 br_multicast_query_expired.constprop.0+0x16d/0x1b0 call_timer_fn+0x3b/0x2a0 __run_timers+0x619/0x950 run_timer_softirq+0x11c/0x220 handle_softirqs+0x18e/0x560 __irq_exit_rcu+0x158/0x1a0 sysvec_apic_timer_interrupt+0x76/0x90 This issue can be reproduced with: ip link add br0 type bridge echo 1 > /sys/class/net/br0/bridge/multicast_querier echo 0xffffffffffffffff > /sys/class/net/br0/bridge/multicast_query_interval ip link set dev br0 up The multicast_startup_query_interval can also cause this issue. Similar to the commit 99b40610956a ("net: bridge: mcast: add and enforce query interval minimum"), add a check for the query interval maximum to fix this issue.
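The wraparound that drives the lockup is easy to show in isolation; the following is a hedged sketch of what br_multicast_send_query() effectively computes (variable names are illustrative):

	/* With a near-ULONG_MAX interval the expiry wraps into the past,
	 * so the timer fires again immediately after every re-arm. */
	unsigned long interval = ~0UL;             /* value taken from sysfs */
	unsigned long expiry = jiffies + interval; /* == jiffies - 1 */

	mod_timer(&query->timer, expiry);          /* expires at once: loop */

Clamping the interval to BR_MULTICAST_QUERY_INTVL_MAX (24 hours, per the hunk below) keeps the sum well away from the wrap point.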
Link: https://lore.kernel.org/netdev/20250806094941.1285944-1-wangliang74@huawei.com/ Link: https://lore.kernel.org/netdev/20250812091818.542238-1-wangliang74@huawei.com/ Fixes: d902eee43f19 ("bridge: Add multicast count/interval sysfs entries") Suggested-by: Nikolay Aleksandrov Signed-off-by: Wang Liang Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250813021054.1643649-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 16 ++++++++++++++++ net/bridge/br_private.h | 2 ++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 1377f31b719c..8ce145938b02 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4818,6 +4818,14 @@ void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX; + } + brmctx->multicast_query_interval = intvl_jiffies; } @@ -4834,6 +4842,14 @@ void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX; + } + brmctx->multicast_startup_query_interval = intvl_jiffies; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b159aae594c0..8de0904b9627 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000) #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN +#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */ +#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX #define BR_HWDOM_MAX BITS_PER_LONG -- cgit v1.2.3 From 52bf272636bda69587952b35ae97690b8dc89941 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 12 Aug 2025 23:57:57 +0000 Subject: net/sched: Fix backlog accounting in qdisc_dequeue_internal This issue applies to the following qdiscs: hhf, fq, fq_codel, and fq_pie, and occurs in their change handlers when adjusting to the new limit. The problem lies in the values passed to the subsequent qdisc_tree_reduce_backlog call, given a tbf parent: when the tbf parent runs out of tokens, skbs of these qdiscs will be placed in gso_skb. Their peek handlers are qdisc_peek_dequeued, which accounts for both qlen and backlog. However, in the case of qdisc_dequeue_internal, ONLY qlen is accounted for when pulling from gso_skb. This means that these qdiscs are missing a qdisc_qstats_backlog_dec when dropping packets to satisfy the new limit in their change handlers.
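For reference, the pre-fix helper looked roughly like this (abridged from the include/net/sch_generic.h hunk further down); the gso_skb branch adjusts qlen but never the backlog bytes:

	static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch,
							     bool direct)
	{
		struct sk_buff *skb;

		skb = __skb_dequeue(&sch->gso_skb);
		if (skb) {
			sch->q.qlen--;     /* qlen is accounted for... */
			return skb;        /* ...the backlog bytes are not */
		}
		if (direct)
			return __qdisc_dequeue_head(&sch->q);
		else
			return sch->dequeue(sch);
	}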
One can observe this issue with the following (with tc patched to support a limit of 0): export TARGET=fq tc qdisc del dev lo root tc qdisc add dev lo root handle 1: tbf rate 8bit burst 100b latency 1ms tc qdisc replace dev lo handle 3: parent 1:1 $TARGET limit 1000 echo ''; echo 'add child'; tc -s -d qdisc show dev lo ping -I lo -f -c2 -s32 -W0.001 127.0.0.1 2>&1 >/dev/null echo ''; echo 'after ping'; tc -s -d qdisc show dev lo tc qdisc change dev lo handle 3: parent 1:1 $TARGET limit 0 echo ''; echo 'after limit drop'; tc -s -d qdisc show dev lo tc qdisc replace dev lo handle 2: parent 1:1 sfq echo ''; echo 'post graft'; tc -s -d qdisc show dev lo The second to last show command shows 0 packets but a positive number (74) of backlog bytes. The problem becomes clearer in the last show command, where qdisc_purge_queue triggers qdisc_tree_reduce_backlog with the positive backlog and causes an underflow in the tbf parent's backlog (4096 Mb instead of 0). To fix this issue, the codepath for all clients of qdisc_dequeue_internal has been simplified: codel, pie, hhf, fq, fq_pie, and fq_codel. qdisc_dequeue_internal handles the backlog adjustments for all cases that do not directly use the dequeue handler. The old fq_codel_change limit adjustment loop accumulated the arguments to the subsequent qdisc_tree_reduce_backlog call through the cstats field. However, this is confusing and error prone as fq_codel_dequeue could also potentially mutate this field (which qdisc_dequeue_internal calls in the non gso_skb case), so we have unified the code here with other qdiscs. Fixes: 2d3cbfd6d54a ("net_sched: Flush gso_skb list too during ->change()") Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250812235725.45243-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 11 ++++++++--- net/sched/sch_codel.c | 12 +++++++----- net/sched/sch_fq.c | 12 +++++++----- net/sched/sch_fq_codel.c | 12 +++++++----- net/sched/sch_fq_pie.c | 12 +++++++----- net/sched/sch_hhf.c | 12 +++++++----- net/sched/sch_pie.c | 12 +++++++----- 7 files changed, 50 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 638948be4c50..738cd5b13c62 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1038,12 +1038,17 @@ static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool dir skb = __skb_dequeue(&sch->gso_skb); if (skb) { sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); return skb; } - if (direct) - return __qdisc_dequeue_head(&sch->q); - else + if (direct) { + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } else { return sch->dequeue(sch); + } } static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c93761040c6e..fa0314679e43 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -101,9 +101,9 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { static int codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, 
@@ -142,15 +142,17 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->params.ecn, !!nla_get_u32(tb[TCA_CODEL_ECN])); - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 902ff5470607..fee922da2f99 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1013,11 +1013,11 @@ static int fq_load_priomap(struct fq_sched_data *q, static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; - int err, drop_count = 0; - unsigned drop_len = 0; u32 fq_log; + int err; err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, NULL); @@ -1135,16 +1135,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_resize(sch, fq_log); sch_tree_lock(sch); } + while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; - drop_len += qdisc_pkt_len(skb); + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - drop_count++; } - qdisc_tree_reduce_backlog(sch, drop_count, drop_len); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2a0f3a513bfa..a14142392939 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -366,6 +366,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; u32 quantum = 0; @@ -443,13 +444,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->memory_usage > q->memory_limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - q->cstats.drop_len += qdisc_pkt_len(skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - q->cstats.drop_count++; } - qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); - q->cstats.drop_count = 0; - q->cstats.drop_len = 0; + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index b0e34daf1f75..7b96bc3ff891 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -287,10 +287,9 @@ begin: static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; - unsigned int len_dropped = 0; - unsigned int num_dropped = 0; int err; err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); @@ -368,11 +367,14 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit) { struct sk_buff *skb = 
qdisc_dequeue_internal(sch, false); - len_dropped += qdisc_pkt_len(skb); - num_dropped += 1; + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 5aa434b46707..2d4855e28a28 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -508,9 +508,9 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -561,15 +561,17 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, usecs_to_jiffies(us)); } - qlen = sch->q.qlen; - prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, - prev_backlog - sch->qstats.backlog); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ad46ee3ed5a9..0a377313b6a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -141,9 +141,9 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, @@ -193,15 +193,17 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; -- cgit v1.2.3 From 2327a3d6f65ce2fe2634546dde4a25ef52296fec Mon Sep 17 00:00:00 2001 From: Charalampos Mitrodimas Date: Tue, 12 Aug 2025 15:51:25 +0000 Subject: net: ipv6: fix field-spanning memcpy warning in AH output Fix field-spanning memcpy warnings in ah6_output() and ah6_output_done() where extension headers are copied to/from IPv6 address fields, triggering fortify-string warnings about writes beyond the 16-byte address fields. memcpy: detected field-spanning write (size 40) of single field "&top_iph->saddr" at net/ipv6/ah6.c:439 (size 16) WARNING: CPU: 0 PID: 8838 at net/ipv6/ah6.c:439 ah6_output+0xe7e/0x14e0 net/ipv6/ah6.c:439 The warnings are false positives as the extension headers are intentionally placed after the IPv6 header in memory. Fix by properly copying addresses and extension headers separately, and introduce helper functions to avoid code duplication. 
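To make the fortify warning concrete, here is a toy example of the flagged pattern, with hypothetical struct and function names; the copy is intentional, but it is expressed as a write through one named 16-byte field:

	#include <string.h>

	struct hdr {
		unsigned char base[8];
		unsigned char saddr[16];   /* named destination field */
		unsigned char daddr[16];   /* intentionally overwritten too */
	};

	static void save_addrs(struct hdr *h, const void *ext)
	{
		/* 32 bytes written through a 16-byte field: fortify warns,
		 * even though the layout makes the overrun deliberate */
		memcpy(h->saddr, ext, sizeof(h->saddr) + sizeof(h->daddr));
	}

The helpers added in this patch avoid the pattern by assigning each address field individually and memcpy()ing only the extension headers.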
Reported-by: syzbot+01b0667934cdceb4451c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=01b0667934cdceb4451c Signed-off-by: Charalampos Mitrodimas Signed-off-by: Steffen Klassert --- net/ipv6/ah6.c | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index eb474f0987ae..95372e0f1d21 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -46,6 +46,34 @@ struct ah_skb_cb { #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) +/* Helper to save IPv6 addresses and extension headers to temporary storage */ +static inline void ah6_save_hdrs(struct tmp_ext *iph_ext, + struct ipv6hdr *top_iph, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + iph_ext->saddr = top_iph->saddr; +#endif + iph_ext->daddr = top_iph->daddr; + memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext)); +} + +/* Helper to restore IPv6 addresses and extension headers from temporary storage */ +static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph, + struct tmp_ext *iph_ext, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + top_iph->saddr = iph_ext->saddr; +#endif + top_iph->daddr = iph_ext->daddr; + memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext)); +} + static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { @@ -301,13 +329,7 @@ static void ah6_output_done(void *data, int err) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); @@ -378,12 +400,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) */ memcpy(iph_base, top_iph, IPV6HDR_BASELEN); + ah6_save_hdrs(iph_ext, top_iph, extlen); if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(iph_ext, &top_iph->saddr, extlen); -#else - memcpy(iph_ext, &top_iph->daddr, extlen); -#endif err = ipv6_clear_mutable_options(top_iph, extlen - sizeof(*iph_ext) + sizeof(*top_iph), @@ -434,13 +452,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); out_free: kfree(iph_base); -- cgit v1.2.3 From 9f4f591cd5a410f4203a9c104f92d467945b7d7e Mon Sep 17 00:00:00 2001 From: Miguel García Date: Thu, 14 Aug 2025 21:32:17 +0200 Subject: xfrm: xfrm_user: use strscpy() for alg_name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the strcpy() calls that copy the canonical algorithm name into alg_name with strscpy() to avoid potential overflows and guarantee NULL termination. Destination is alg_name in xfrm_algo/xfrm_algo_auth/xfrm_algo_aead (size CRYPTO_MAX_ALG_NAME). 
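As a brief aside on the API, a hedged sketch of the difference (assuming the two-argument strscpy() form used in the patch, which infers the bound from the destination array via sizeof):

	char alg_name[64];          /* stand-in for CRYPTO_MAX_ALG_NAME */

	strcpy(alg_name, src);      /* unbounded: can overflow alg_name */

	/* bounded, always NUL-terminates, returns -E2BIG on truncation */
	if (strscpy(alg_name, src) < 0)
		pr_warn("algorithm name truncated\n");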
Tested in QEMU (BusyBox/Alpine rootfs): - Added ESP AEAD (rfc4106(gcm(aes))) and classic ESP (sha256 + cbc(aes)) - Verified canonical names via ip -d xfrm state - Checked IPComp negative (unknown algo) and deflate path Signed-off-by: Miguel García Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 684239018bec..010c9e6638c0 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -593,7 +593,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); *algpp = p; return 0; } @@ -620,7 +620,7 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); x->ealg = p; x->geniv = algo->uinfo.encr.geniv; return 0; @@ -649,7 +649,7 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); p->alg_key_len = ualg->alg_key_len; p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8); @@ -684,7 +684,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); if (!p->alg_trunc_len) p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; @@ -714,7 +714,7 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); x->aead = p; x->geniv = algo->uinfo.aead.geniv; return 0; -- cgit v1.2.3 From de5d7d3f27ddd4046736f558a40e252ddda82013 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 28 Jul 2025 17:08:44 +0800 Subject: Bluetooth: hci_sync: Avoid adding default advertising on startup list_empty(&hdev->adv_instances) is always true during startup, so an advertising instance is added by default. Call trace: dump_backtrace+0x94/0xec show_stack+0x18/0x24 dump_stack_lvl+0x48/0x60 dump_stack+0x18/0x24 hci_setup_ext_adv_instance_sync+0x17c/0x328 hci_powered_update_adv_sync+0xb4/0x12c hci_powered_update_sync+0x54/0x70 hci_power_on_sync+0xe4/0x278 hci_set_powered_sync+0x28/0x34 set_powered_sync+0x40/0x58 hci_cmd_sync_work+0x94/0x100 process_one_work+0x168/0x444 worker_thread+0x378/0x3f4 kthread+0x108/0x10c ret_from_fork+0x10/0x20 Link: https://github.com/bluez/bluez/issues/1442 Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 2b4f21fbf9c1..7397b6b50ccb 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3344,7 +3344,7 @@ static int hci_powered_update_adv_sync(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. 
*/ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); -- cgit v1.2.3 From ca88be1a2725a42f8dbad579181611d9dcca8e88 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Jul 2025 16:43:18 -0400 Subject: Bluetooth: hci_sync: Fix scan state after PA Sync has been established Passive scanning is used to program the address of the peer to be synchronized, so once HCI_EV_LE_PA_SYNC_ESTABLISHED is received the scan state needs to be updated: clear HCI_PA_SYNC, then call hci_update_passive_scan_sync to restore it to its original state. Fixes: 6d0417e4e1cf ("Bluetooth: hci_conn: Fix not setting conn_timeout for Broadcast Receiver") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7397b6b50ccb..387c128f2ba0 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6985,8 +6985,6 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - if (!hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); @@ -7080,6 +7078,11 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + /* Update passive scan since HCI_PA_SYNC flag has been cleared */ + hci_update_passive_scan_sync(hdev); + return err; } -- cgit v1.2.3 From aee29c18a38d479c2f058c9b6a39b0527cf81d10 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 24 Jul 2025 16:36:27 -0400 Subject: Bluetooth: ISO: Fix getname not returning broadcast fields getname shall return iso_bc fields for both BIS_LINK and PA_LINK, since the likes of bluetoothd do use getpeername to retrieve the SID both when enumerating the broadcasters and when synchronizing. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 7bd3aa0a6db9..eaffd25570e3 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1347,7 +1347,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; - if (hcon && hcon->type == BIS_LINK) { + if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) { sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, -- cgit v1.2.3 From d36349ea73d805bb72cbc24ab90cb1da4ad5c379 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 28 Jul 2025 13:51:01 -0400 Subject: Bluetooth: hci_conn: Fix running bis_cleanup for hci_conn->type PA_LINK Connections of type PA_LINK shall be considered temporary, existing just to track the lifetime of the PA Sync setup. Once the BIG Sync is established and connections are created with BIS_LINK, the existing PA_LINK connection shall no longer use bis_cleanup, otherwise it terminates the PA Sync when that shall be left to the BIS_LINK connection to do.
Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 12 +++++++++++- net/bluetooth/hci_event.c | 7 ++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7d1e79f69cd1..f8b20b609a03 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -830,7 +830,17 @@ static void bis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. */ - bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECTED, + HCI_ROLE_MASTER); + if (bis) + return; + + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECT, + HCI_ROLE_MASTER); if (bis) return; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 8aa5039b975a..4f0a6116291e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6957,9 +6957,14 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, continue; } - if (ev->status != 0x42) + if (ev->status != 0x42) { /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); + /* Reset cleanup callback of PA Sync so it doesn't + * terminate the sync when deleting the connection. + */ + conn->cleanup = NULL; + } bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; -- cgit v1.2.3 From 3ba486c5f3ce2c22ffd29c0103404cdbe21912b3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 29 Jul 2025 12:11:09 -0400 Subject: Bluetooth: hci_conn: Fix not cleaning up Broadcaster/Broadcast Source This fixes Broadcaster/Broadcast Source not sending HCI_OP_LE_TERM_BIG because HCI_CONN_PER_ADV was not being set. Fixes: a7bcffc673de ("Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f8b20b609a03..ab6fe5b0cc0f 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2259,7 +2259,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ -- cgit v1.2.3 From 709788b154caf042874d765628ffa860f0bb0d1e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 09:54:05 -0400 Subject: Bluetooth: hci_core: Fix using {cis,bis}_capable for current settings {cis,bis}_capable only indicates that the controller supports the feature, since it doesn't check that LE is enabled, so it shall not be used for the current settings; instead, introduce {cis,bis}_enabled macros that can be used to indicate that these features are currently enabled.
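Condensed from the hunk below, the capable/enabled layering reduces to this pattern (only the BIS pair is shown; the CIS and sync receiver macros follow the same shape):

	/* "capable": the controller advertises the feature bit.
	 * "enabled": the feature bit is set AND LE is actually enabled.
	 * Only the latter may be reported as a current setting. */
	#define le_enabled(dev)  (lmp_le_capable(dev) && \
				  hci_dev_test_flag(dev, HCI_LE_ENABLED))
	#define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER)
	#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev))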
Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Fixes: ae7533613133 ("Bluetooth: Check for ISO support in controller") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 4 ++-- include/net/bluetooth/hci_core.h | 13 ++++++++++++- net/bluetooth/hci_sync.c | 4 ++-- net/bluetooth/iso.c | 14 +++++++------- net/bluetooth/mgmt.c | 10 +++++----- 5 files changed, 28 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index ada5b56a4413..e5751f3070b8 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -647,7 +647,7 @@ static inline void sco_exit(void) #if IS_ENABLED(CONFIG_BT_LE) int iso_init(void); int iso_exit(void); -bool iso_enabled(void); +bool iso_inited(void); #else static inline int iso_init(void) { @@ -659,7 +659,7 @@ static inline int iso_exit(void) return 0; } -static inline bool iso_enabled(void) +static inline bool iso_inited(void) { return false; } diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 4dc11c66f7b8..bc29f2e2e16f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1915,6 +1915,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) +#define le_enabled(dev) (lmp_le_capable(dev) && \ + hci_dev_test_flag(dev, HCI_LE_ENABLED)) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) @@ -1981,14 +1983,23 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_central_enabled(dev) \ + (le_enabled(dev) && cis_central_capable(dev)) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define cis_peripheral_enabled(dev) \ + (le_enabled(dev) && cis_peripheral_capable(dev)) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) -#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev)) +#define sync_recv_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 387c128f2ba0..aa7d7a8ec3ee 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4531,14 +4531,14 @@ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; - if (!cis_capable(hdev)) + if (!iso_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; - cp.bit_value = 1; + cp.bit_value = iso_enabled(hdev) ? 
0x01 : 0x00; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index eaffd25570e3..5ce823ca3aaf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2483,11 +2483,11 @@ static const struct net_proto_family iso_sock_family_ops = { .create = iso_sock_create, }; -static bool iso_inited; +static bool inited; -bool iso_enabled(void) +bool iso_inited(void) { - return iso_inited; + return inited; } int iso_init(void) @@ -2496,7 +2496,7 @@ int iso_init(void) BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); - if (iso_inited) + if (inited) return -EALREADY; err = proto_register(&iso_proto, 0); @@ -2524,7 +2524,7 @@ int iso_init(void) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - iso_inited = true; + inited = true; return 0; @@ -2535,7 +2535,7 @@ error: int iso_exit(void) { - if (!iso_inited) + if (!inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); @@ -2549,7 +2549,7 @@ int iso_exit(void) proto_unregister(&iso_proto); - iso_inited = false; + inited = false; return 0; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1ce682038b51..c42dffe77daf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -922,16 +922,16 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; - if (cis_central_capable(hdev)) + if (cis_central_enabled(hdev)) settings |= MGMT_SETTING_CIS_CENTRAL; - if (cis_peripheral_capable(hdev)) + if (cis_peripheral_enabled(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; - if (bis_capable(hdev)) + if (bis_enabled(hdev)) settings |= MGMT_SETTING_ISO_BROADCASTER; - if (sync_recv_capable(hdev)) + if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; if (ll_privacy_capable(hdev)) @@ -4513,7 +4513,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } if (IS_ENABLED(CONFIG_BT_LE)) { - flags = iso_enabled() ? BIT(0) : 0; + flags = iso_inited() ? BIT(0) : 0; memcpy(rp->features[idx].uuid, iso_socket_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; -- cgit v1.2.3 From 3dcf7175f2c04bd3a7d50db3fa42a0bd933b6e23 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 4 Aug 2025 14:05:03 -0400 Subject: Bluetooth: hci_core: Fix using ll_privacy_capable for current settings ll_privacy_capable only indicates that the controller supports the feature, but it doesn't check that LE is enabled, so it ends up being marked as active in the current settings when it shouldn't be.
Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index bc29f2e2e16f..bb30bde6f0e8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1934,6 +1934,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev)) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ ((dev)->commands[39] & 0x04)) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index c42dffe77daf..3166f5fb876b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -934,7 +934,7 @@ static u32 get_current_settings(struct hci_dev *hdev) if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; - if (ll_privacy_capable(hdev)) + if (ll_privacy_enabled(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; return settings; -- cgit v1.2.3 From 4d19cd228bbe8ff84a63fe7b11bc756b4b4370c7 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 7 Aug 2025 15:56:03 +0800 Subject: Bluetooth: hci_sync: Prevent unintended PA sync when SID is 0xFF After LE Extended Scan times out, conn->sid remains 0xFF, so the PA sync creation process should be aborted. Btmon snippet from PA sync with SID=0xFF: < HCI Command: LE Set Extended.. (0x08|0x0042) plen 6 #74726 [hci0] 863.107927 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 #74727 [hci0] 863.109389 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Success (0x00) < HCI Command: LE Periodic Ad.. (0x08|0x0044) plen 14 #74728 [hci0] 865.141168 Options: 0x0000 Use advertising SID, Advertiser Address Type and address Reporting initially enabled SID: 0xff Adv address type: Random (0x01) Adv address: 0D:D7:2C:E7:42:46 (Non-Resolvable) Skip: 0x0000 Sync timeout: 20000 msec (0x07d0) Sync CTE type: 0x0000 > HCI Event: Command Status (0x0f) plen 4 #74729 [hci0] 865.143223 LE Periodic Advertising Create Sync (0x08|0x0044) ncmd 1 Status: Success (0x00) Fixes: e2d471b7806b ("Bluetooth: ISO: Fix not using SID from adv report") Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index aa7d7a8ec3ee..31d72b9683ef 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -7045,10 +7045,13 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update * it. 
*/ - if (conn->sid == HCI_SID_INVALID) - __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, - HCI_EV_LE_EXT_ADV_REPORT, - conn->conn_timeout, NULL); + if (conn->sid == HCI_SID_INVALID) { + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_EXT_ADV_REPORT, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + goto done; + } memset(&cp, 0, sizeof(cp)); cp.options = qos->bcast.options; @@ -7078,6 +7081,7 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); +done: hci_dev_clear_flag(hdev, HCI_PA_SYNC); /* Update passive scan since HCI_PA_SYNC flag has been cleared */ -- cgit v1.2.3 From 0b3725dbf61b51e7c663834811b3691157ae17d6 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 9 Aug 2025 11:36:20 +0300 Subject: Bluetooth: hci_event: fix MTU for BN == 0 in CIS Established BN == 0x00 in CIS Established means no isochronous data for the corresponding direction (Core v6.1 pp. 2394). In this case SDU MTU should be 0. However, the specification does not say the Max_PDU_C_To_P or P_To_C are then zero. Intel AX210 in Framed CIS mode sets nonzero Max_PDU for direction with zero BN. This causes failure later when we try to LE Setup ISO Data Path for disabled direction, which is disallowed (Core v6.1 pp. 2750). Fix by setting SDU MTU to 0 if BN == 0. Fixes: 2be22f1941d5f ("Bluetooth: hci_event: Fix parsing of CIS Established Event") Signed-off-by: Pauli Virtanen Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4f0a6116291e..fe7cdd67ad2a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6745,8 +6745,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.out.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.in.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.out.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.in.phy = ev->c_phy; qos->ucast.out.phy = ev->p_phy; break; @@ -6760,8 +6760,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.in.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.out.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.in.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.out.phy = ev->c_phy; qos->ucast.in.phy = ev->p_phy; break; -- cgit v1.2.3 From 0eaf7c7e85da7495c0e03a99375707fc954f5e7b Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Tue, 5 Aug 2025 22:14:51 +0300 Subject: Bluetooth: hci_conn: do return error from hci_enhanced_setup_sync() The commit e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") missed to update the *return* statement under the *case* of BT_CODEC_TRANSPARENT in hci_enhanced_setup_sync(), which led to returning success (0) instead of the negative error code (-EINVAL). However, the result of hci_enhanced_setup_sync() seems to be ignored anyway, since NULL gets passed to hci_cmd_sync_queue() as the last argument in that case and the only function interested in that result is specified by that argument. 
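The bug class is worth spelling out: in a function with a 0-or-negative-errno return convention, 'return false' compiles cleanly but evaluates to 'return 0', i.e. success. A minimal standalone sketch (illustrative only, not kernel code):

    #include <errno.h>
    #include <stdbool.h>

    /* Returns 0 on success, negative errno on failure. */
    static int setup_codec(bool param_found)
    {
            if (!param_found)
                    return -EINVAL; /* was 'return false', i.e. 0 == success */
            return 0;
    }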
Fixes: e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") Signed-off-by: Sergey Shtylyov Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ab6fe5b0cc0f..7a879290dd28 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -339,7 +339,8 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) case BT_CODEC_TRANSPARENT: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) - return false; + return -EINVAL; + param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x03; cp.rx_coding_format.id = 0x03; -- cgit v1.2.3 From 7de0eebbb4c3bb44c296f66679ad37480139dc6e Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 14 Aug 2025 12:23:55 +0800 Subject: net: bridge: remove unused argument of br_multicast_query_expired() Since commit 67b746f94ff3 ("net: bridge: mcast: make sure querier port/address updates are consistent"), the argument 'querier' is unused, just get rid of it. Signed-off-by: Wang Liang Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250814042355.1720755-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 1377f31b719c..4dc62d01e2d3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4049,8 +4049,7 @@ int br_multicast_rcv(struct net_bridge_mcast **brmctx, } static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, - struct bridge_mcast_own_query *query, - struct bridge_mcast_querier *querier) + struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (br_multicast_ctx_vlan_disabled(brmctx)) @@ -4069,8 +4068,7 @@ static void br_ip4_multicast_query_expired(struct timer_list *t) struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip4_own_query.timer); - br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, - &brmctx->ip4_querier); + br_multicast_query_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) @@ -4079,8 +4077,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip6_own_query.timer); - br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, - &brmctx->ip6_querier); + br_multicast_query_expired(brmctx, &brmctx->ip6_own_query); } #endif -- cgit v1.2.3 From 89d912e494f786e79f69ed9d567a8842c71dbb03 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:27 +0200 Subject: bpf: Add dynptr type for skb metadata Add a dynptr type, similar to skb dynptr, but for skb metadata access. The dynptr provides an alternative to __sk_buff->data_meta for accessing the custom metadata area allocated using the bpf_xdp_adjust_meta() helper. More importantly, it abstracts away where the storage for the custom metadata lives, which opens up the way to persist the metadata by relocating it as the skb travels through the network stack layers. Writes to skb metadata invalidate any existing skb payload and metadata slices. While this is more restrictive than needed at the moment, it leaves the door open to reallocating the metadata on writes, and should be only a minor inconvenience to the users.
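To make the invalidation rule concrete, here is a hypothetical tc-BPF fragment (a sketch against this series, not an existing selftest; the kfunc declaration mirrors the bpf_dynptr_from_skb_meta() added in net/core/filter.c below):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
                                        struct bpf_dynptr *ptr) __ksym;

    SEC("tc")
    int meta_dynptr_demo(struct __sk_buff *skb)
    {
            struct bpf_dynptr meta;

            if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
                    return TC_ACT_OK;

            /* Once writes are implemented by the follow-up patches, any
             * bpf_dynptr_write() through 'meta' invalidates slices
             * previously taken from this skb; they must be re-derived.
             */
            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";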
Only the program types which can access __sk_buff->data_meta today are allowed to create a dynptr for skb metadata at the moment. We need to modify the network stack to persist the metadata across layers before opening up access to other BPF hooks. Once more BPF hooks gain access to skb_meta dynptr, we will also need to add a read-only variant of the helper similar to bpf_dynptr_from_skb_rdonly. skb_meta dynptr ops are stubbed out and implemented by subsequent changes. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-1-8a39e636e0fb@cloudflare.com --- include/linux/bpf.h | 7 ++++++- kernel/bpf/helpers.c | 7 +++++++ kernel/bpf/log.c | 2 ++ kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802..ec527b476dba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -767,12 +767,15 @@ enum bpf_type_flag { */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ + DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1358,6 +1361,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, + /* Points to skb_metadata_end()-skb_metadata_len() */ + BPF_DYNPTR_TYPE_SKB_META, }; int bpf_dynptr_check_size(u32 size); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..9552b32208c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1780,6 +1780,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1836,6 +1838,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1882,6 +1886,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -2710,6 +2715,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return NULL; /* not implemented */ default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c 
index 38050f4ee400..e4983c1303e7 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; + case BPF_DYNPTR_TYPE_SKB_META: + return "skb_meta"; case BPF_DYNPTR_TYPE_INVALID: return ""; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..5964bed40ffb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; default: return 0; } @@ -2274,7 +2278,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ @@ -11641,7 +11646,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -12228,6 +12234,7 @@ enum special_kfunc_type { KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, @@ -12277,9 +12284,11 @@ BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) @@ -13253,6 +13262,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; diff --git a/net/core/filter.c b/net/core/filter.c index da391e2b0788..31b4b50dbadf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12007,6 +12007,36 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, return 0; } +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. 
+ * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { @@ -12181,6 +12211,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) @@ -12202,6 +12236,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, @@ -12237,6 +12276,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); -- cgit v1.2.3 From 6877cd392baecf816c2ba896a9d42874628004a5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:28 +0200 Subject: bpf: Enable read/write access to skb metadata through a dynptr Now that we can create a dynptr to skb metadata, make reads to the metadata area possible with bpf_dynptr_read() or through a bpf_dynptr_slice(), and make writes to the metadata area possible with bpf_dynptr_write() or through a bpf_dynptr_slice_rdwr(). Note that for cloned skbs which share data with the original, we limit the skb metadata dynptr to be read-only since we don't unclone on a bpf_dynptr_write to metadata. 
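What this enables from a program's point of view, sketched as a hypothetical tc-BPF snippet (names as in this series; error handling trimmed):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
                                        struct bpf_dynptr *ptr) __ksym;

    SEC("tc")
    int meta_rw(struct __sk_buff *skb)
    {
            struct bpf_dynptr meta;
            __u32 tag = 0;

            if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
                    return TC_ACT_OK;

            /* read 4 bytes at offset 0 of the metadata area */
            if (bpf_dynptr_read(&tag, sizeof(tag), &meta, 0, 0))
                    return TC_ACT_OK;

            tag |= 0x1;

            /* for a cloned skb the dynptr is read-only, so this write
             * fails with -EINVAL instead of touching shared data
             */
            bpf_dynptr_write(&meta, 0, &tag, sizeof(tag), 0);

            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";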
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-2-8a39e636e0fb@cloudflare.com --- include/linux/filter.h | 6 ++++++ kernel/bpf/helpers.c | 10 +++++++--- net/core/filter.c | 16 ++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e..9ed21b65e2e9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1784,6 +1784,7 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1818,6 +1819,11 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi unsigned long len, bool flush) { } + +static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9552b32208c5..cdffd74ddbe6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1781,7 +1781,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1839,7 +1840,10 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + if (flags) + return -EINVAL; + memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -2716,7 +2720,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, return buffer__opt; } case BPF_DYNPTR_TYPE_SKB_META: - return NULL; /* not implemented */ + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/net/core/filter.c b/net/core/filter.c index 31b4b50dbadf..63f3baee2daf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11990,6 +11990,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func; } +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12017,6 +12027,9 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, * XDP context with bpf_xdp_adjust_meta(). 
Serves as an alternative to * &__sk_buff->data_meta. * + * If passed @skb_ is a clone which shares the data with the original, the + * dynptr will be read-only. This limitation may be lifted in the future. + * * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null @@ -12034,6 +12047,9 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + if (skb_cloned(skb)) + bpf_dynptr_set_rdonly(ptr); + return 0; } -- cgit v1.2.3 From 864e3396976ef41de6cc7bc366276bf4e084fff2 Mon Sep 17 00:00:00 2001 From: Jakub Ramaseuski Date: Thu, 14 Aug 2025 12:51:19 +0200 Subject: net: gso: Forbid IPv6 TSO with extensions on devices with only IPV6_CSUM When performing Generic Segmentation Offload (GSO) on an IPv6 packet that contains extension headers, the kernel incorrectly requests checksum offload if the egress device only advertises NETIF_F_IPV6_CSUM feature, which has a strict contract: it supports checksum offload only for plain TCP or UDP over IPv6 and explicitly does not support packets with extension headers. The current GSO logic violates this contract by failing to disable the feature for packets with extension headers, such as those used in GREoIPv6 tunnels. This violation results in the device being asked to perform an operation it cannot support, leading to a `skb_warn_bad_offload` warning and a collapse of network throughput. While device TSO/USO is correctly bypassed in favor of software GSO for these packets, the GSO stack must be explicitly told not to request checksum offload. Mask NETIF_F_IPV6_CSUM, NETIF_F_TSO6 and NETIF_F_GSO_UDP_L4 in gso_features_check if the IPv6 header contains extension headers to compute checksum in software. The exception is a BIG TCP extension, which, as stated in commit 68e068cabd2c6c53 ("net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets"): "The feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices." kernel log output (truncated): WARNING: CPU: 1 PID: 5273 at net/core/dev.c:3535 skb_warn_bad_offload+0x81/0x140 ... Call Trace: skb_checksum_help+0x12a/0x1f0 validate_xmit_skb+0x1a3/0x2d0 validate_xmit_skb_list+0x4f/0x80 sch_direct_xmit+0x1a2/0x380 __dev_xmit_skb+0x242/0x670 __dev_queue_xmit+0x3fc/0x7f0 ip6_finish_output2+0x25e/0x5d0 ip6_finish_output+0x1fc/0x3f0 ip6_tnl_xmit+0x608/0xc00 [ip6_tunnel] ip6gre_tunnel_xmit+0x1c0/0x390 [ip6_gre] dev_hard_start_xmit+0x63/0x1c0 __dev_queue_xmit+0x6d0/0x7f0 ip6_finish_output2+0x214/0x5d0 ip6_finish_output+0x1fc/0x3f0 ip6_xmit+0x2ca/0x6f0 ip6_finish_output+0x1fc/0x3f0 ip6_xmit+0x2ca/0x6f0 inet6_csk_xmit+0xeb/0x150 __tcp_transmit_skb+0x555/0xa80 tcp_write_xmit+0x32a/0xe90 tcp_sendmsg_locked+0x437/0x1110 tcp_sendmsg+0x2f/0x50 ... 
skb linear: 00000000: e4 3d 1a 7d ec 30 e4 3d 1a 7e 5d 90 86 dd 60 0e skb linear: 00000010: 00 0a 1b 34 3c 40 20 11 00 00 00 00 00 00 00 00 skb linear: 00000020: 00 00 00 00 00 12 20 11 00 00 00 00 00 00 00 00 skb linear: 00000030: 00 00 00 00 00 11 2f 00 04 01 04 01 01 00 00 00 skb linear: 00000040: 86 dd 60 0e 00 0a 1b 00 06 40 20 23 00 00 00 00 skb linear: 00000050: 00 00 00 00 00 00 00 00 00 12 20 23 00 00 00 00 skb linear: 00000060: 00 00 00 00 00 00 00 00 00 11 bf 96 14 51 13 f9 skb linear: 00000070: ae 27 a0 a8 2b e3 80 18 00 40 5b 6f 00 00 01 01 skb linear: 00000080: 08 0a 42 d4 50 d5 4b 70 f8 1a Fixes: 04c20a9356f283da ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: Tianhao Zhao Suggested-by: Michal Schmidt Suggested-by: Willem de Bruijn Signed-off-by: Jakub Ramaseuski Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250814105119.1525687-1-jramaseu@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 5a3c0f40a93f..93a25d87b86b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3779,6 +3779,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. + */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } -- cgit v1.2.3 From 84967deee9d9870b15bc4c3acb50f1d401807902 Mon Sep 17 00:00:00 2001 From: Minhong He Date: Fri, 15 Aug 2025 14:38:45 +0800 Subject: ipv6: sr: validate HMAC algorithm ID in seg6_hmac_info_add The seg6_genl_sethmac() directly uses the algorithm ID provided by the userspace without verifying whether it is an HMAC algorithm supported by the system. If an unsupported HMAC algorithm ID is configured, packets using SRv6 HMAC will be dropped during encapsulation or decapsulation. Fixes: 4f4853dc1c9c ("ipv6: sr: implement API to control SR HMAC structure") Signed-off-by: Minhong He Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250815063845.85426-1-heminhong@kylinos.cn Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index f78ecb6ad838..d77b52523b6a 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -304,6 +304,9 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; + if (!__hmac_get_algo(hinfo->alg_id)) + return -EINVAL; + err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); -- cgit v1.2.3 From ccab044697980c6c01ab51f43f48f13b8a3e5c33 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Fri, 15 Aug 2025 19:28:19 +0200 Subject: mptcp: drop skb if MPTCP skb extension allocation fails When skb_ext_add(skb, SKB_EXT_MPTCP) fails in mptcp_incoming_options(), we used to return true, letting the segment proceed through the TCP receive path without a DSS mapping. 
Such segments can leave inconsistent mapping state and trigger a mid-stream fallback to TCP, which in testing (with failures artificially forced in skb_ext_add) collapsed throughput to zero. Return false instead so the TCP input path drops the skb (see tcp_data_queue() and step-7 processing). This is the safer choice under memory pressure: it preserves MPTCP correctness and provides backpressure to the sender. Control packets remain unaffected: ACK updates and DATA_FIN handling happen before attempting the extension allocation, and tcp_reset() continues to ignore the return value. With this change, MPTCP continues to work at high throughput if we artificially inject failures into skb_ext_add. Fixes: 6787b7e350d3 ("mptcp: avoid processing packet if a subflow reset") Cc: stable@vger.kernel.org Signed-off-by: Christoph Paasch Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-1-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 70c0ab0ecf90..2a8ea28442b2 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -1118,7 +1118,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, return hmac == mp_opt->ahmac; } -/* Return false if a subflow has been reset, else return true */ +/* Return false in case of error (or subflow has been reset), + * else return true. + */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1222,7 +1224,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) - return true; + return false; memset(mpext, 0, sizeof(*mpext)); -- cgit v1.2.3 From 68fc0f4b0d25692940cdc85c68e366cae63e1757 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Aug 2025 19:28:20 +0200 Subject: mptcp: pm: kernel: flush: do not reset ADD_ADDR limit A flush of the MPTCP endpoints should not affect the MPTCP limits. In other words, 'ip mptcp endpoint flush' should not change 'ip mptcp limits'. But that was happening: the MPTCP_PM_ATTR_RCV_ADD_ADDRS (add_addr_accepted) limit was reset by accident. Removing the reset of this counter during a flush fixes this issue.
Fixes: 01cacb00b35c ("mptcp: add netlink-based PM") Cc: stable@vger.kernel.org Reported-by: Thomas Dreibholz Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/579 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-2-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index d39e7c178460..667803d72b64 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -1085,7 +1085,6 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } -- cgit v1.2.3 From 5d13349472ac8abcbcb94407969aa0fdc2e1f1be Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:22 +0200 Subject: mptcp: remove duplicate sk_reset_timer call sk_reset_timer() was called twice in mptcp_pm_alloc_anno_list. Simplify the code by using a 'goto' statement to eliminate the duplication. Note that this is not a fix, but it will help backporting the following patch. The same "Fixes" tag has been added for this reason. Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-4-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 420d416e2603..c5f6a53ce5f1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -353,9 +353,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; + goto reset_timer; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); @@ -369,6 +367,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); +reset_timer: sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); -- cgit v1.2.3 From f5ce0714623cffd00bf2a83e890d09c609b7f50a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Aug 2025 19:28:23 +0200 Subject: mptcp: disable add_addr retransmission when timeout is 0 When add_addr_timeout was set to 0, this caused the ADD_ADDR to be retransmitted immediately, which looks like a buggy behaviour. Instead, interpret 0 as "no retransmissions needed". The documentation is updated to explicitly state that setting the timeout to 0 disables retransmission. 
Fixes: 93f323b9cccc ("mptcp: add a new sysctl add_addr_timeout") Cc: stable@vger.kernel.org Suggested-by: Matthieu Baerts Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250815-net-mptcp-misc-fixes-6-17-rc2-v1-5-521fe9957892@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 2 ++ net/mptcp/pm.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 5bfab01eff5a..1683c139821e 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -12,6 +12,8 @@ add_addr_timeout - INTEGER (seconds) resent to an MPTCP peer that has not acknowledged a previous ADD_ADDR message. + Do not retransmit if set to 0. + The default value matches TCP_RTO_MAX. This is a per-namespace sysctl. diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index c5f6a53ce5f1..136a380602ca 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -274,6 +274,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; + unsigned int timeout; pr_debug("msk=%p\n", msk); @@ -291,6 +292,10 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } + timeout = mptcp_get_add_addr_timeout(sock_net(sk)); + if (!timeout) + goto out; + spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { @@ -302,7 +307,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + jiffies + timeout); spin_unlock_bh(&msk->pm.lock); @@ -344,6 +349,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); + unsigned int timeout; lockdep_assert_held(&msk->pm.lock); @@ -368,8 +374,9 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); + timeout = mptcp_get_add_addr_timeout(net); + if (timeout) + sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); return true; } -- cgit v1.2.3 From f179f5bc158f07693b74c264f8933c8b0f07503f Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 15 Aug 2025 10:53:17 -0300 Subject: net/sched: sch_dualpi2: Run prob update timer in softirq to avoid deadlock When a user creates a dualpi2 qdisc it automatically sets a timer. This timer will run constantly and update the qdisc's probability field. The issue is that the timer acquires the qdisc root lock and runs in hardirq. The qdisc root lock is also acquired in dev.c whenever a packet arrives for this qdisc. Since the dualpi2 timer callback runs in hardirq, it may interrupt the packet processing running in softirq. If that happens and it runs on the same CPU, it will acquire the same lock and cause a deadlock. The following splat shows up when running a kernel compiled with lock debugging: [ +0.000224] WARNING: inconsistent lock state [ +0.000224] 6.16.0+ #10 Not tainted [ +0.000169] -------------------------------- [ +0.000029] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. 
[ +0.000000] ping/156 [HC0[0]:SC0[2]:HE1:SE0] takes: [ +0.000000] ffff897841242110 (&sch->root_lock_key){?.-.}-{3:3}, at: __dev_queue_xmit+0x86d/0x1140 [ +0.000000] {IN-HARDIRQ-W} state was registered at: [ +0.000000] lock_acquire.part.0+0xb6/0x220 [ +0.000000] _raw_spin_lock+0x31/0x80 [ +0.000000] dualpi2_timer+0x6f/0x270 [ +0.000000] __hrtimer_run_queues+0x1c5/0x360 [ +0.000000] hrtimer_interrupt+0x115/0x260 [ +0.000000] __sysvec_apic_timer_interrupt+0x6d/0x1a0 [ +0.000000] sysvec_apic_timer_interrupt+0x6e/0x80 [ +0.000000] asm_sysvec_apic_timer_interrupt+0x1a/0x20 [ +0.000000] pv_native_safe_halt+0xf/0x20 [ +0.000000] default_idle+0x9/0x10 [ +0.000000] default_idle_call+0x7e/0x1e0 [ +0.000000] do_idle+0x1e8/0x250 [ +0.000000] cpu_startup_entry+0x29/0x30 [ +0.000000] rest_init+0x151/0x160 [ +0.000000] start_kernel+0x6f3/0x700 [ +0.000000] x86_64_start_reservations+0x24/0x30 [ +0.000000] x86_64_start_kernel+0xc8/0xd0 [ +0.000000] common_startup_64+0x13e/0x148 [ +0.000000] irq event stamp: 6884 [ +0.000000] hardirqs last enabled at (6883): [] neigh_resolve_output+0x223/0x270 [ +0.000000] hardirqs last disabled at (6882): [] neigh_resolve_output+0x1e8/0x270 [ +0.000000] softirqs last enabled at (6880): [] neigh_resolve_output+0x1db/0x270 [ +0.000000] softirqs last disabled at (6884): [] __dev_queue_xmit+0x73/0x1140 [ +0.000000] other info that might help us debug this: [ +0.000000] Possible unsafe locking scenario: [ +0.000000] CPU0 [ +0.000000] ---- [ +0.000000] lock(&sch->root_lock_key); [ +0.000000] [ +0.000000] lock(&sch->root_lock_key); [ +0.000000] *** DEADLOCK *** [ +0.000000] 4 locks held by ping/156: [ +0.000000] #0: ffff897842332e08 (sk_lock-AF_INET){+.+.}-{0:0}, at: raw_sendmsg+0x41e/0xf40 [ +0.000000] #1: ffffffffa816f880 (rcu_read_lock){....}-{1:3}, at: ip_output+0x2c/0x190 [ +0.000000] #2: ffffffffa816f880 (rcu_read_lock){....}-{1:3}, at: ip_finish_output2+0xad/0x950 [ +0.000000] #3: ffffffffa816f840 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x73/0x1140 I am able to reproduce it consistently when running the following: tc qdisc add dev lo handle 1: root dualpi2 ping -f 127.0.0.1 To fix it, make the timer run in softirq. 
Fixes: 320d031ad6e4 ("sched: Struct definition and parsing of dualpi2 qdisc") Reviewed-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250815135317.664993-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_dualpi2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 845375ebd4ea..4b975feb52b1 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -927,7 +927,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); @@ -937,7 +938,7 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, } hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS_PINNED_SOFT); return 0; } -- cgit v1.2.3 From 5236f57e7c033d869fe8f2080a977ea47882b26f Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sat, 16 Aug 2025 16:12:48 -0700 Subject: net: Make nexthop-dumps scale linearly with the number of nexthops When we have a (very) large number of nexthops, they do not fit within a single message. rtm_dump_walk_nexthops() thus will be called repeatedly and ctx->idx is used to avoid dumping the same nexthops again. The approach in which we avoid dumping the same nexthops is by basically walking the entire nexthop rb-tree from the left-most node until we find a node whose id is >= s_idx. That does not scale well. Instead of this inefficient approach, rather go directly through the tree to the nexthop that should be dumped (the one whose nh_id >= s_idx). This allows us to find the relevant node in O(log(n)). We have quite a nice improvement with this: Before: ======= --> ~1M nexthops: $ time ~/libnl/src/nl-nh-list | wc -l 1050624 real 0m21.080s user 0m0.666s sys 0m20.384s --> ~2M nexthops: $ time ~/libnl/src/nl-nh-list | wc -l 2101248 real 1m51.649s user 0m1.540s sys 1m49.908s After: ====== --> ~1M nexthops: $ time ~/libnl/src/nl-nh-list | wc -l 1050624 real 0m1.157s user 0m0.926s sys 0m0.259s --> ~2M nexthops: $ time ~/libnl/src/nl-nh-list | wc -l 2101248 real 0m2.763s user 0m2.042s sys 0m0.776s Signed-off-by: Christoph Paasch Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250816-nexthop_dump-v2-1-491da3462118@openai.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 29118c43ebf5..509004bfd08e 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3511,12 +3511,42 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, int err; s_idx = ctx->idx; - for (node = rb_first(root); node; node = rb_next(node)) { + + /* If this is not the first invocation, ctx->idx will contain the id of + * the last nexthop we processed. Instead of starting from the very + * first element of the red/black tree again and linearly skipping the + * (potentially large) set of nodes with an id smaller than s_idx, walk + * the tree and find the left-most node whose id is >= s_idx. 
This + * provides an efficient O(log n) starting point for the dump + * continuation. + */ + if (s_idx != 0) { + struct rb_node *tmp = root->rb_node; + + node = NULL; + while (tmp) { + struct nexthop *nh; + + nh = rb_entry(tmp, struct nexthop, rb_node); + if (nh->id < s_idx) { + tmp = tmp->rb_right; + } else { + /* Track current candidate and keep looking on + * the left side to find the left-most + * (smallest id) that is still >= s_idx. + */ + node = tmp; + tmp = tmp->rb_left; + } + } + } else { + node = rb_first(root); + } + + for (; node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); - if (nh->id < s_idx) - continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); -- cgit v1.2.3 From b0ac6d3b56a2384db151696cfda2836a8a961b6d Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sat, 16 Aug 2025 16:12:49 -0700 Subject: net: When removing nexthops, don't call synchronize_net if it is not necessary When removing a nexthop, commit 90f33bffa382 ("nexthops: don't modify published nexthop groups") added a call to synchronize_rcu() (later changed to _net()) to make sure everyone sees the new nexthop-group before the rtnl-lock is released. When one wants to delete a large number of groups and nexthops, it is fastest to first flush the groups (ip nexthop flush groups) and then flush the nexthops themselves (ip -6 nexthop flush). As that way the groups don't need to be rebalanced. However, `ip -6 nexthop flush` will still take a long time if there is a very large number of nexthops because of the call to synchronize_net(). Now, if there are no more groups, there is no point in calling synchronize_net(). So, let's skip that entirely by checking if nh->grp_list is empty. This gives us a nice speedup: BEFORE: ======= $ time sudo ip -6 nexthop flush Dump was interrupted and may be inconsistent. Flushed 2097152 nexthops real 1m45.345s user 0m0.001s sys 0m0.005s $ time sudo ip -6 nexthop flush Dump was interrupted and may be inconsistent. Flushed 4194304 nexthops real 3m10.430s user 0m0.002s sys 0m0.004s AFTER: ====== $ time sudo ip -6 nexthop flush Dump was interrupted and may be inconsistent. Flushed 2097152 nexthops real 0m17.545s user 0m0.003s sys 0m0.003s $ time sudo ip -6 nexthop flush Dump was interrupted and may be inconsistent. Flushed 4194304 nexthops real 0m35.823s user 0m0.002s sys 0m0.004s Signed-off-by: Christoph Paasch Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250816-nexthop_dump-v2-2-491da3462118@openai.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 509004bfd08e..0a20625f5ffb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2087,6 +2087,12 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, { struct nh_grp_entry *nhge, *tmp; + /* If there is nothing to do, let's avoid the costly call to + * synchronize_net() + */ + if (list_empty(&nh->grp_list)) + return; + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); -- cgit v1.2.3 From c829aab21ed55d9e38346604259aa3ff88d17274 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:27 -0700 Subject: xfrm: Switch to skb_dstref_steal to clear dst_entry Going forward skb_dst_set will assert that skb dst_entry is empty during skb_dst_set. 
skb_dstref_steal is added to reset existing entry without doing refcnt. Switch to skb_dstref_steal in __xfrm_route_forward and add a comment on why it's safe to skip skb_dstref_restore. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-3-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/xfrm/xfrm_policy.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5035a9bc3bb..7111184eef59 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3881,12 +3881,18 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); - if (!skb_dst(skb)) { + dst = skb_dst(skb); + if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } - dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); + + dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; -- cgit v1.2.3 From 15488d4d8dc10a575f32cc692b6c9aa99f66bc7b Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:28 -0700 Subject: netfilter: Switch to skb_dstref_steal to clear dst_entry Going forward skb_dst_set will assert that skb dst_entry is empty during skb_dst_set. skb_dstref_steal is added to reset existing entry without doing refcnt. Switch to skb_dstref_steal in ip[6]_route_me_harder and add a comment on why it's safe to skip skb_dstref_restore. Acked-by: Florian Westphal Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-4-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter.c | 5 ++++- net/ipv6/netfilter.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 0565f001120d..e60e54e7945d 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -65,7 +65,10 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); - skb_dst_set(skb, NULL); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 45f9105f9ac1..46540a5a4331 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -63,7 +63,10 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { - skb_dst_set(skb, NULL); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. 
+ */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); -- cgit v1.2.3 From e97e6a1830ddb5885ba312e56b6fa3aa39b5f47e Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:29 -0700 Subject: net: Switch to skb_dstref_steal/skb_dstref_restore for ip_route_input callers Going forward skb_dst_set will assert that skb dst_entry is empty during skb_dst_set. skb_dstref_steal is added to reset existing entry without doing refcnt. skb_dstref_restore should be used to restore the previous entry. Convert icmp_route_lookup and ip_options_rcv_srr to these helpers. Add extra call to skb_dstref_reset to icmp_route_lookup to clear the ip_route_input entry. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-5-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 7 ++++--- net/ipv4/ip_options.c | 5 ++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2ffe73ea644f..91765057aa1d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -544,14 +544,15 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, goto relookup_failed; } /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ - skb_dst_set(skb_in, NULL); + orefdst = skb_dstref_steal(skb_in); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ + /* steal dst entry from skb_in, don't drop refcnt */ + skb_dstref_steal(skb_in); + skb_dstref_restore(skb_in, orefdst); } if (err) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index e3321932bec0..be8815ce3ac2 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -615,14 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) } memcpy(&nexthop, &optptr[srrptr-1], 4); - orefdst = skb->_skb_refdst; - skb_dst_set(skb, NULL); + orefdst = skb_dstref_steal(skb); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); - skb->_skb_refdst = orefdst; + skb_dstref_restore(skb, orefdst); return -EINVAL; } refdst_drop(orefdst); -- cgit v1.2.3 From 09bde6fdcd752b4512e7b554a3259e4f6b77c6d1 Mon Sep 17 00:00:00 2001 From: Miguel García Date: Tue, 19 Aug 2025 00:02:03 +0200 Subject: ipv6: ip6_gre: replace strcpy with strscpy for tunnel name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the strcpy() call that copies the device name into tunnel->parms.name with strscpy(), to avoid potential overflow and guarantee NULL termination. This uses the two-argument form of strscpy(), where the destination size is inferred from the array type. Destination is tunnel->parms.name (size IFNAMSIZ). 
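The two-argument form is safe precisely because the destination is a fixed-size array: the bound is inferred from the array type at compile time, so it cannot drift out of sync with the field's declaration. A minimal sketch:

    /* Both forms bound the copy and guarantee NUL-termination; with
     * the two-argument form the bound comes from the array type.
     */
    char name[IFNAMSIZ];

    strscpy(name, parms->name, IFNAMSIZ); /* explicit size */
    strscpy(name, parms->name);           /* size inferred from name[] */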
Tested in QEMU (Alpine rootfs): - Created IPv6 GRE tunnels over loopback - Assigned overlay IPv6 addresses - Verified bidirectional ping through the tunnel - Changed tunnel parameters at runtime (`ip -6 tunnel change`) Signed-off-by: Miguel García Link: https://patch.msgid.link/20250818220203.899338-1-miguelgarciaroman8@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_gre.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74d49dd6124d..c82a75510c0e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -329,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) return NULL; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { - strcpy(name, "ip6gre%d"); + strscpy(name, "ip6gre%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); @@ -1469,7 +1469,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) @@ -1529,7 +1529,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; } @@ -1842,7 +1842,7 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) -- cgit v1.2.3 From d9cef55ed49117bd63695446fb84b4b91815c0b4 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Mon, 18 Aug 2025 13:46:18 +0800 Subject: net/smc: fix UAF on smcsk after smc_listen_out() BPF CI testing reported a UAF issue: [ 16.446633] BUG: kernel NULL pointer dereference, address: 0000000000000030 [ 16.447134] #PF: supervisor read access in kernel mode [ 16.447516] #PF: error_code(0x0000) - not-present page [ 16.447878] PGD 0 P4D 0 [ 16.448063] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI [ 16.448409] CPU: 0 UID: 0 PID: 9 Comm: kworker/0:1 Tainted: G OE 6.13.0-rc3-g89e8a75fda73-dirty #42 [ 16.449124] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 16.449502] Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 16.450201] Workqueue: smc_hs_wq smc_listen_work [ 16.450531] RIP: 0010:smc_listen_work+0xc02/0x1590 [ 16.452158] RSP: 0018:ffffb5ab40053d98 EFLAGS: 00010246 [ 16.452526] RAX: 0000000000000001 RBX: 0000000000000002 RCX: 0000000000000300 [ 16.452994] RDX: 0000000000000280 RSI: 00003513840053f0 RDI: 0000000000000000 [ 16.453492] RBP: ffffa097808e3800 R08: ffffa09782dba1e0 R09: 0000000000000005 [ 16.453987] R10: 0000000000000000 R11: 0000000000000000 R12: ffffa09782746400 [ 16.454497] R13: 0000000000000000 R14: 0000000000000000 R15: ffffa09782d40920 [ 16.454996] FS: 0000000000000000(0000) GS:ffffa097bbc00000(0000) knlGS:0000000000000000 [ 16.455557] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 16.455961] CR2: 0000000000000030 CR3: 0000000102788004 CR4: 0000000000770ef0 [ 16.456459] PKRU: 55555554 [ 16.456654] Call Trace: [ 16.456832] [ 16.456989] ? __die+0x23/0x70 [ 16.457215] ? page_fault_oops+0x180/0x4c0 [ 16.457508] ? __lock_acquire+0x3e6/0x2490 [ 16.457801] ?
exc_page_fault+0x68/0x200 [ 16.458080] ? asm_exc_page_fault+0x26/0x30 [ 16.458389] ? smc_listen_work+0xc02/0x1590 [ 16.458689] ? smc_listen_work+0xc02/0x1590 [ 16.458987] ? lock_is_held_type+0x8f/0x100 [ 16.459284] process_one_work+0x1ea/0x6d0 [ 16.459570] worker_thread+0x1c3/0x380 [ 16.459839] ? __pfx_worker_thread+0x10/0x10 [ 16.460144] kthread+0xe0/0x110 [ 16.460372] ? __pfx_kthread+0x10/0x10 [ 16.460640] ret_from_fork+0x31/0x50 [ 16.460896] ? __pfx_kthread+0x10/0x10 [ 16.461166] ret_from_fork_asm+0x1a/0x30 [ 16.461453] [ 16.461616] Modules linked in: bpf_testmod(OE) [last unloaded: bpf_testmod(OE)] [ 16.462134] CR2: 0000000000000030 [ 16.462380] ---[ end trace 0000000000000000 ]--- [ 16.462710] RIP: 0010:smc_listen_work+0xc02/0x1590 The direct cause of this issue is that after smc_listen_out_connected(), newclcsock->sk may be NULL since it will release the smcsk. Therefore, if the application closes the socket immediately after accept, newclcsock->sk can be NULL. A possible execution order could be as follows: smc_listen_work | userspace ----------------------------------------------------------------- lock_sock(sk) | smc_listen_out_connected() | | \- smc_listen_out | | | \- release_sock | | |- sk->sk_data_ready() | | fd = accept(); | close(fd); | \- socket->sk = NULL; /* newclcsock->sk is NULL now */ SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk)) Since smc_listen_out_connected() will not fail, simply swapping the order of the code can easily fix this issue. Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Signed-off-by: D. Wythe Reviewed-by: Guangguan Wang Reviewed-by: Alexandra Winter Reviewed-by: Dust Li Link: https://patch.msgid.link/20250818054618.41615-1-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9311c38f7abe..e0e48f24cd61 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2568,8 +2568,9 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } - smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + /* smc_listen_out() will release smcsk */ + smc_listen_out_connected(new_smc); goto out_free; out_unlock: -- cgit v1.2.3 From 68889dfd547bd8eabc5a98b58475d7b901cf5129 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:09 +0000 Subject: mptcp: Fix up subflow's memcg when CONFIG_SOCK_CGROUP_DATA=n. When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets sk->sk_memcg based on the current task. MPTCP subflow socket creation is triggered from userspace or an in-kernel worker. In the latter case, sk->sk_memcg is not what we want. So, we fix it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup(). Although the code is placed under #ifdef CONFIG_MEMCG, it is buried under #ifdef CONFIG_SOCK_CGROUP_DATA. The two configs are orthogonal. If CONFIG_MEMCG is enabled without CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged correctly. Let's move the code out of the wrong ifdef guard. Note that sk->sk_memcg is freed in sk_prot_free() and the parent sk holds the refcnt of memcg->css here, so we don't need to use css_tryget().
Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Matthieu Baerts (NGI0) Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 13 +++++++++++++ net/mptcp/subflow.c | 11 +++-------- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..25921fbec685 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1604,6 +1604,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) @@ -1661,6 +1662,11 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg); #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; + +static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ +} + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..46713b9ece06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,6 +5024,19 @@ void mem_cgroup_sk_free(struct sock *sk) css_put(&sk->sk_memcg->css); } +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ + if (sk->sk_memcg == newsk->sk_memcg) + return; + + mem_cgroup_sk_free(newsk); + + if (sk->sk_memcg) + css_get(&sk->sk_memcg->css); + + newsk->sk_memcg = sk->sk_memcg; +} + /** * mem_cgroup_charge_skmem - charge socket memory * @memcg: memcg to charge diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..c8a7e4b59db1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1717,19 +1717,14 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = parent->sk_memcg; - - mem_cgroup_sk_free(child); - if (memcg && css_tryget(&memcg->css)) - child->sk_memcg = memcg; -#endif /* CONFIG_MEMCG */ - cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ + + if (mem_cgroup_sockets_enabled) + mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) -- cgit v1.2.3 From 1068b48ed10805b61be0668cd774af97163479a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:10 +0000 Subject: mptcp: Use tcp_under_memory_pressure() in mptcp_epollin_ready(). Some conditions used in mptcp_epollin_ready() are the same as tcp_under_memory_pressure(). We will modify tcp_under_memory_pressure() in the later patch. Let's use tcp_under_memory_pressure() instead. 
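For reference, tcp_under_memory_pressure() already performs exactly the
pair of checks open-coded here; its definition at this point in the
series (visible as context in a tcp.h hunk further down this log) is:

    /* optimized version of sk_under_memory_pressure() for TCP sockets */
    static inline bool tcp_under_memory_pressure(const struct sock *sk)
    {
        if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
            mem_cgroup_under_socket_pressure(sk->sk_memcg))
            return true;

        return READ_ONCE(tcp_memory_pressure);
    }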
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b15d7fab5c4b..a1787a1344ac 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -788,9 +788,7 @@ static inline bool mptcp_epollin_ready(const struct sock *sk) * as it can always coalesce them */ return (data_avail >= sk->sk_rcvlowat) || - (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) || - READ_ONCE(tcp_memory_pressure); + tcp_under_memory_pressure(sk); } int mptcp_set_rcvlowat(struct sock *sk, int val); -- cgit v1.2.3 From e2afa83296bbac40829624b508492b562a21e4d4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:11 +0000 Subject: tcp: Simplify error path in inet_csk_accept(). When an error occurs in inet_csk_accept(), what we should do is only call release_sock() and set the errno to arg->err. But the path jumps to another label, which introduces unnecessary initialisation and tests for newsk. Let's simplify the error path and remove the redundant NULL checks for newsk. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1e2df51427fe..724bd9ed6cd4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -706,9 +706,9 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) spin_unlock_bh(&queue->fastopenq.lock); } -out: release_sock(sk); - if (newsk && mem_cgroup_sockets_enabled) { + + if (mem_cgroup_sockets_enabled) { gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt = 0; @@ -732,18 +732,17 @@ out: release_sock(newsk); } + if (req) reqsk_put(req); - if (newsk) - inet_init_csk_locks(newsk); - + inet_init_csk_locks(newsk); return newsk; + out_err: - newsk = NULL; - req = NULL; + release_sock(sk); arg->err = error; - goto out; + return NULL; } EXPORT_SYMBOL(inet_csk_accept); -- cgit v1.2.3 From 9d85c565a7b7c78b732393c02bcaa4d5c275fe58 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:12 +0000 Subject: net: Call trace_sock_exceed_buf_limit() for memcg failure with SK_MEM_RECV. Initially, trace_sock_exceed_buf_limit() was invoked when __sk_mem_raise_allocated() failed due to the memcg limit or the global limit. However, commit d6f19938eb031 ("net: expose sk wmem in sock_exceed_buf_limit tracepoint") somehow suppressed the event only when memcg failed to charge for SK_MEM_RECV, although the memcg failure for SK_MEM_SEND still triggers the event. Let's restore the event for SK_MEM_RECV. 
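Spelled out, the condition being deleted gated the tracepoint in the
failure path of __sk_mem_raise_allocated() as follows (see the diff
below):

    kind         memcg charge   traced before   traced after
    SK_MEM_SEND  succeeded      yes             yes
    SK_MEM_SEND  failed         yes             yes
    SK_MEM_RECV  succeeded      yes             yes
    SK_MEM_RECV  failed         no (lost)       yes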
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 7c26ec8dce63..380bc1aa6982 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3354,8 +3354,7 @@ suppress_allocation: } } - if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) - trace_sock_exceed_buf_limit(sk, prot, allocated, kind); + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); -- cgit v1.2.3 From bd4aa2337374dd04b8627efd26227ebd49f69285 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:13 +0000 Subject: net: Clean up __sk_mem_raise_allocated(). In __sk_mem_raise_allocated(), charged is initialised as true due to the weird condition removed in the previous patch. It makes the variable unreliable by itself, so we have to check another variable, memcg, in advance. Also, we will factorise the common check below for memcg later. if (mem_cgroup_sockets_enabled && sk->sk_memcg) As a prep, let's initialise charged as false and memcg as NULL. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 380bc1aa6982..000940ecf360 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3263,15 +3263,16 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { - struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; - bool charged = true; + struct mem_cgroup *memcg = NULL; + bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); - if (memcg) { + if (mem_cgroup_sockets_enabled && sk->sk_memcg) { + memcg = sk->sk_memcg; charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; @@ -3358,7 +3359,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (memcg && charged) + if (charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; -- cgit v1.2.3 From f7161b234f2ec7f18999009c4becc04eeb6b12a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:14 +0000 Subject: net-memcg: Introduce mem_cgroup_from_sk(). We will store a flag in the lowest bit of sk->sk_memcg. Then, directly dereferencing sk->sk_memcg will be illegal, and we do not want to allow touching the raw sk->sk_memcg in many places. Let's introduce mem_cgroup_from_sk(). Other places accessing the raw sk->sk_memcg will be converted later. Note that we cannot define the helper as an inline function in memcontrol.h as we cannot access any fields of struct sock there due to circular dependency, so it is placed in sock.h. 
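To see why raw dereferences have to go, here is a sketch of what the
accessor would look like once a flag occupies bit 0 of the pointer. The
flag name and bit are illustrative assumptions; this patch itself only
adds the plain wrapper shown in the diff below:

    #define SK_MEMCG_FLAG 0x1UL  /* hypothetical flag stored in bit 0 */

    static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
    {
        /* mask off the flag bit before using the pointer */
        return (struct mem_cgroup *)
               ((unsigned long)sk->sk_memcg & ~SK_MEMCG_FLAG);
    }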
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 12 ++++++++++++ mm/memcontrol.c | 13 +++++++++---- net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index c8a4b283df6f..811f95ea8d00 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2594,6 +2594,18 @@ static inline gfp_t gfp_memcg_charge(void) return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return sk->sk_memcg; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return NULL; +} +#endif + static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46713b9ece06..d8a52d1d08fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5020,19 +5020,24 @@ out: void mem_cgroup_sk_free(struct sock *sk) { - if (sk->sk_memcg) - css_put(&sk->sk_memcg->css); + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + + if (memcg) + css_put(&memcg->css); } void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { + struct mem_cgroup *memcg; + if (sk->sk_memcg == newsk->sk_memcg) return; mem_cgroup_sk_free(newsk); - if (sk->sk_memcg) - css_get(&sk->sk_memcg->css); + memcg = mem_cgroup_from_sk(sk); + if (memcg) + css_get(&memcg->css); newsk->sk_memcg = sk->sk_memcg; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 724bd9ed6cd4..93569bbe00f4 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -718,7 +718,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) lock_sock(newsk); mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg) { + if (mem_cgroup_from_sk(newsk)) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ -- cgit v1.2.3 From 43049b0db03823c2cd003ca7d3dddcd3924da8dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:15 +0000 Subject: net-memcg: Introduce mem_cgroup_sk_enabled(). The socket memcg feature is enabled by a static key and only works for non-root cgroup. We check both conditions in many places. Let's factorise it as a helper function. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/proto_memory.h | 2 +- include/net/sock.h | 10 ++++++++++ include/net/tcp.h | 2 +- net/core/sock.c | 6 +++--- net/ipv4/tcp_output.c | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index a6ab2f4f5e28..859e63de81c4 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -31,7 +31,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) if (!sk->sk_prot->memory_pressure) return false; - if (mem_cgroup_sockets_enabled && sk->sk_memcg && + if (mem_cgroup_sk_enabled(sk) && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; diff --git a/include/net/sock.h b/include/net/sock.h index 811f95ea8d00..3efdf680401d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2599,11 +2599,21 @@ static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return sk->sk_memcg; } + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); +} #else static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) { return NULL; } + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return false; +} #endif static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) diff --git a/include/net/tcp.h b/include/net/tcp.h index 526a26e7a150..9f01b6be6444 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -275,7 +275,7 @@ extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_memcg && + if (mem_cgroup_sk_enabled(sk) && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; diff --git a/net/core/sock.c b/net/core/sock.c index 000940ecf360..ab658fe23e1e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) @@ -3271,7 +3271,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) { + if (mem_cgroup_sk_enabled(sk)) { memcg = sk->sk_memcg; charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); if (!charged) @@ -3398,7 +3398,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (mem_cgroup_sk_enabled(sk)) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); if (sk_under_global_memory_pressure(sk) && diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index caf11920a878..37fb320e6f70 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3578,7 +3578,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (mem_cgroup_sk_enabled(sk)) mem_cgroup_charge_skmem(sk->sk_memcg, amt, 
gfp_memcg_charge() | __GFP_NOFAIL); } -- cgit v1.2.3 From bb178c6bc08525d758a57775458d644304011bf8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:16 +0000 Subject: net-memcg: Pass struct sock to mem_cgroup_sk_(un)?charge(). We will store a flag in the lowest bit of sk->sk_memcg. Then, we cannot pass the raw pointer to mem_cgroup_charge_skmem() and mem_cgroup_uncharge_skmem(). Let's pass struct sock to the functions. While at it, they are renamed to match other functions starting with mem_cgroup_sk_. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 29 ++++++++++++++++++++++++----- mm/memcontrol.c | 18 +++++++++++------- net/core/sock.c | 24 +++++++++++------------- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_output.c | 3 +-- 5 files changed, 48 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25921fbec685..0837d3de3a68 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1596,15 +1596,16 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask); -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) + void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask); +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) @@ -1660,13 +1661,31 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 -static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; -static inline void mem_cgroup_sk_free(struct sock *sk) { }; + +static inline void mem_cgroup_sk_alloc(struct sock *sk) +{ +} + +static inline void mem_cgroup_sk_free(struct sock *sk) +{ +} static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { } +static inline bool mem_cgroup_sk_charge(const struct sock *sk, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + return false; +} + +static inline void mem_cgroup_sk_uncharge(const struct sock *sk, + unsigned int nr_pages) +{ +} + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d8a52d1d08fa..df3e9205c9e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5043,17 +5043,19 @@ void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) } /** - * mem_cgroup_charge_skmem - charge socket memory - * @memcg: memcg to charge + * mem_cgroup_sk_charge - charge socket memory + * @sk: socket in memcg to charge * @nr_pages: number of pages to charge * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. 
Returns %true if the charge fit within * @memcg's configured limit, %false if it doesn't. */ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask) +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return memcg1_charge_skmem(memcg, nr_pages, gfp_mask); @@ -5066,12 +5068,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, } /** - * mem_cgroup_uncharge_skmem - uncharge socket memory - * @memcg: memcg to uncharge + * mem_cgroup_sk_uncharge - uncharge socket memory + * @sk: socket in memcg to uncharge * @nr_pages: number of pages to uncharge */ -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { memcg1_uncharge_skmem(memcg, nr_pages); return; diff --git a/net/core/sock.c b/net/core/sock.c index ab658fe23e1e..5537ca263858 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1041,8 +1041,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) pages = sk_mem_pages(bytes); /* pre-charge to memcg */ - charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; @@ -1054,7 +1054,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); - mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); @@ -3263,17 +3263,16 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - struct mem_cgroup *memcg = NULL; - bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (mem_cgroup_sk_enabled(sk)) { - memcg = sk->sk_memcg; - charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; } @@ -3347,10 +3346,9 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg && !charged) { - mem_cgroup_charge_skmem(memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); - } + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } @@ -3360,7 +3358,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); if (charged) - mem_cgroup_uncharge_skmem(memcg, amt); + mem_cgroup_sk_uncharge(sk, amt); return 0; } @@ -3399,7 +3397,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + mem_cgroup_sk_uncharge(sk, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 93569bbe00f4..0ef1eacd539d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ 
-727,7 +727,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
 	}
 
 	if (amt)
-		mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp);
+		mem_cgroup_sk_charge(newsk, amt, gfp);
 
 	kmem_cache_charge(newsk, gfp);
 
 	release_sock(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 37fb320e6f70..dfbac0876d96 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3579,8 +3579,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
 	sk_memory_allocated_add(sk, amt);
 
 	if (mem_cgroup_sk_enabled(sk))
-		mem_cgroup_charge_skmem(sk->sk_memcg, amt,
-					gfp_memcg_charge() | __GFP_NOFAIL);
+		mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
 }
 
 /* Send a FIN. The caller locks the socket for us.
--
cgit v1.2.3


From bf64002c94fc330b996bc438f3d1b6bd3d781659 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Fri, 15 Aug 2025 20:16:18 +0000
Subject: net: Define sk_memcg under CONFIG_MEMCG.

Except for sk_clone_lock(), all accesses to sk->sk_memcg are done
under CONFIG_MEMCG.

As a bonus, let's define sk->sk_memcg under CONFIG_MEMCG.

Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Eric Dumazet
Acked-by: Roman Gushchin
Acked-by: Shakeel Butt
Link: https://patch.msgid.link/20250815201712.1745332-11-kuniyu@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/sock.h | 2 ++
 net/core/sock.c    | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 3bc4d566f7d0..1c49ea13af4a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -443,7 +443,9 @@ struct sock {
 	__cacheline_group_begin(sock_read_rxtx);
 	int			sk_err;
 	struct socket		*sk_socket;
+#ifdef CONFIG_MEMCG
 	struct mem_cgroup	*sk_memcg;
+#endif
 #ifdef CONFIG_XFRM
 	struct xfrm_policy __rcu *sk_policy[2];
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 5537ca263858..ab6953d295df 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2512,8 +2512,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		sock_reset_flag(newsk, SOCK_DONE);
 
+#ifdef CONFIG_MEMCG
 		/* sk->sk_memcg will be populated at accept() time */
 		newsk->sk_memcg = NULL;
+#endif
 
 		cgroup_sk_clone(&newsk->sk_cgrp_data);
 
@@ -4452,7 +4454,9 @@ static int __init sock_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+#ifdef CONFIG_MEMCG
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+#endif
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
--
cgit v1.2.3


From dd91c79e4f58fbe2898dac84858033700e0e99fb Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Mon, 18 Aug 2025 13:54:23 -0700
Subject: sctp: Fix MAC comparison to be constant-time

To prevent timing attacks, MACs need to be compared in constant time.
Use the appropriate helper function for this.
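memcmp() may return as soon as the first mismatching byte is found, so
its running time can leak how many leading bytes of a forged MAC were
correct. A constant-time comparison accumulates the differences
instead; a minimal sketch of the principle behind crypto_memneq()
(illustrative only, not the kernel's exact implementation):

    /* Runs in time independent of where, or whether, a mismatch occurs. */
    static int ct_memneq(const void *a, const void *b, size_t len)
    {
        const unsigned char *pa = a, *pb = b;
        unsigned char diff = 0;
        size_t i;

        for (i = 0; i < len; i++)
            diff |= pa[i] ^ pb[i];  /* no data-dependent early exit */

        return diff != 0;           /* nonzero iff the buffers differ */
    }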
Fixes: bbd0d59809f9 ("[SCTP]: Implement the receive and verification of AUTH chunk") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sm_make_chunk.c | 3 ++- net/sctp/sm_statefuns.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 3ead591c72fd..d099b605e44a 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -31,6 +31,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include @@ -1788,7 +1789,7 @@ struct sctp_association *sctp_unpack_cookie( } } - if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { + if (crypto_memneq(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { *error = -SCTP_IERROR_BAD_SIG; goto fail; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index a0524ba8d787..d4d5b14b49b3 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -4416,7 +4417,7 @@ static enum sctp_ierror sctp_sf_authenticate( sh_key, GFP_ATOMIC); /* Discard the packet if the digests do not match */ - if (memcmp(save_digest, digest, sig_len)) { + if (crypto_memneq(save_digest, digest, sig_len)) { kfree(save_digest); return SCTP_IERROR_BAD_SIG; } -- cgit v1.2.3 From bf40785fa437c1752117df2edb3220e9c37d98a6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:54:24 -0700 Subject: sctp: Use HMAC-SHA1 and HMAC-SHA256 library for chunk authentication For SCTP chunk authentication, use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. There's no longer any need to pre-allocate 'crypto_shash' objects; the SCTP code now simply calls into the HMAC code directly. As part of this, make SCTP always support both HMAC-SHA1 and HMAC-SHA256. Previously, it only guaranteed support for HMAC-SHA1. However, HMAC-SHA256 tended to be supported too anyway, as it was supported if CONFIG_CRYPTO_SHA256 was enabled elsewhere in the kconfig. 
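Condensed from the hunks that follow, the shape of the conversion is:
the allocate/setkey/digest sequence against a pre-allocated transform
becomes a single stateless library call, which also removes the error
handling, since the library functions cannot fail:

    /* Before: a per-endpoint crypto_shash, allocated up front. */
    tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);       /* can fail */
    crypto_shash_setkey(tfm, key, key_len);               /* can fail */
    crypto_shash_tfm_digest(tfm, data, data_len, digest);

    /* After: one-shot call, no allocation, no failure path. */
    hmac_sha256_usingrawkey(key, key_len, data, data_len, digest);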
Acked-by: Xin Long Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-4-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sctp/auth.h | 17 ++--- include/net/sctp/constants.h | 4 -- include/net/sctp/structs.h | 5 -- net/sctp/Kconfig | 16 +++-- net/sctp/auth.c | 166 ++++++++----------------------------------- net/sctp/chunk.c | 3 +- net/sctp/sm_make_chunk.c | 2 +- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 10 --- 9 files changed, 48 insertions(+), 177 deletions(-) (limited to 'net') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index d4b3b2dcd15b..3d5879e08e78 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,16 +22,11 @@ struct sctp_endpoint; struct sctp_association; struct sctp_authkey; struct sctp_hmacalgo; -struct crypto_shash; -/* - * Define a generic struct that will hold all the info - * necessary for an HMAC transform - */ +/* Defines an HMAC algorithm supported by SCTP chunk authentication */ struct sctp_hmac { - __u16 hmac_id; /* one of the above ids */ - char *hmac_name; /* name for loading */ - __u16 hmac_len; /* length of the signature */ + __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */ + __u16 hmac_len; /* length of the HMAC value in bytes */ }; /* This is generic structure that containst authentication bytes used @@ -78,9 +73,9 @@ int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp); -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]); -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs); int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5859e0a16a58..8e0f4c4f7750 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -417,16 +417,12 @@ enum { SCTP_AUTH_HMAC_ID_RESERVED_0, SCTP_AUTH_HMAC_ID_SHA1, SCTP_AUTH_HMAC_ID_RESERVED_2, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) SCTP_AUTH_HMAC_ID_SHA256, -#endif __SCTP_AUTH_HMAC_MAX }; #define SCTP_AUTH_HMAC_ID_MAX __SCTP_AUTH_HMAC_MAX - 1 #define SCTP_AUTH_NUM_HMACS __SCTP_AUTH_HMAC_MAX -#define SCTP_SHA1_SIG_SIZE 20 -#define SCTP_SHA256_SIG_SIZE 32 /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH chunks diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8a540ad9b509..6be6aec25731 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1329,11 +1329,6 @@ struct sctp_endpoint { /* rcvbuf acct. policy. 
*/ __u32 rcvbuf_policy; - /* SCTP AUTH: array of the HMACs that will be allocated - * we need this per association so that we don't serialize - */ - struct crypto_shash **auth_hmacs; - /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 24d5a35ce894..09c77b4d161b 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,9 +7,9 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS select NET_CRC32C select NET_UDP_TUNNEL help @@ -79,15 +79,17 @@ config SCTP_COOKIE_HMAC_MD5 bool "Enable optional MD5 hmac cookie generation" help Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 - select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 config SCTP_COOKIE_HMAC_SHA1 bool "Enable optional SHA1 hmac cookie generation" help Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 - select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA1 config INET_SCTP_DIAG depends on INET_DIAG diff --git a/net/sctp/auth.c b/net/sctp/auth.c index c58fffc86a0c..82aad477590e 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -12,36 +12,37 @@ * Vlad Yasevich */ -#include +#include +#include #include #include -#include #include #include -static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { +static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, - .hmac_name = "hmac(sha1)", - .hmac_len = SCTP_SHA1_SIG_SIZE, + .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, - .hmac_name = "hmac(sha256)", - .hmac_len = SCTP_SHA256_SIG_SIZE, + .hmac_len = SHA256_DIGEST_SIZE, } -#endif }; +static bool sctp_hmac_supported(__u16 hmac_id) +{ + return hmac_id < ARRAY_SIZE(sctp_hmac_list) && + sctp_hmac_list[hmac_id].hmac_len != 0; +} void sctp_auth_key_put(struct sctp_auth_bytes *key) { @@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( return NULL; } -/* - * Initialize all the possible digest transforms that we can use. Right - * now, the supported digests are SHA1 and SHA256. We do this here once - * because of the restrictiong that transforms may only be allocated in - * user context. This forces us to pre-allocated all possible transforms - * at the endpoint init time. - */ -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) -{ - struct crypto_shash *tfm = NULL; - __u16 id; - - /* If the transforms are already allocated, we are done */ - if (ep->auth_hmacs) - return 0; - - /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS, - sizeof(struct crypto_shash *), - gfp); - if (!ep->auth_hmacs) - return -ENOMEM; - - for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) { - - /* See is we support the id. Supported IDs have name and - * length fields set, so that we can allocated and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. 
- */ - if (!sctp_hmac_list[id].hmac_name) - continue; - - /* If this TFM has been allocated, we are all set */ - if (ep->auth_hmacs[id]) - continue; - - /* Allocate the ID */ - tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0); - if (IS_ERR(tfm)) - goto out_err; - - ep->auth_hmacs[id] = tfm; - } - - return 0; - -out_err: - /* Clean up any successful allocations */ - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; - return -ENOMEM; -} - -/* Destroy the hmac tfm array */ -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]) -{ - int i; - - if (!auth_hmacs) - return; - - for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { - crypto_free_shash(auth_hmacs[i]); - } - kfree(auth_hmacs); -} - - -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } @@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) /* Get an hmac description information that we can use to build * the AUTH chunk */ -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; @@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range. And - * see if we support the id. Supported IDs have name and - * length fields set, so that we can allocate and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (id > SCTP_AUTH_HMAC_ID_MAX || - !sctp_hmac_list[id].hmac_name) { - id = 0; - continue; - } - - break; + if (sctp_hmac_supported(id)) + return &sctp_hmac_list[id]; } - - if (id == 0) - return NULL; - - return &sctp_hmac_list[id]; + return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) @@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { - struct sctp_endpoint *ep; __u16 id; int i; int n_params; @@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; - ep = asoc->ep; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range */ - if (id > SCTP_AUTH_HMAC_ID_MAX) - continue; - - /* If this TFM has been allocated, use this id */ - if (ep->auth_hmacs[id]) { + if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } @@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; - struct crypto_shash *tfm; __u16 key_id, hmac_id; - unsigned char *end; int free_key = 0; + size_t data_len; __u8 *digest; /* Extract the info we need: @@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, free_key = 1; } - /* set up scatter list */ - end = skb_tail_pointer(skb); - - tfm = asoc->ep->auth_hmacs[hmac_id]; - + data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); - if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len)) - goto free; - - crypto_shash_tfm_digest(tfm, (u8 *)auth, 
end - (unsigned char *)auth, - digest); + if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { + hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } else { + WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); + hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } -free: if (free_key) sctp_auth_key_put(asoc_key); } @@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; - if (id > SCTP_AUTH_HMAC_ID_MAX) + if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; - - if (!sctp_hmac_list[id].hmac_name) - return -EOPNOTSUPP; } if (!has_sha1) @@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { - int err = -ENOMEM; - /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. @@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) ep->auth_chunk_list = auth_chunks; } - /* Allocate and initialize transorms arrays for supported - * HMACs. - */ - err = sctp_auth_init_hmacs(ep, gfp); - if (err) - goto nomem; - return 0; nomem: @@ -1075,7 +969,7 @@ nomem: kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - return err; + return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) @@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep) kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index fd4f8243cc35..c655b571ca01 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * DATA. 
 */
 	if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) {
-		struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
+		const struct sctp_hmac *hmac_desc =
+			sctp_auth_asoc_get_hmac(asoc);
 
 		if (hmac_desc)
 			max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) +
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index d099b605e44a..a1a3c8494c5d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1320,7 +1320,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
 				  __u16 key_id)
 {
 	struct sctp_authhdr auth_hdr;
-	struct sctp_hmac *hmac_desc;
+	const struct sctp_hmac *hmac_desc;
 	struct sctp_chunk *retval;
 
 	/* Get the first hmac that the peer told us to use */
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index d4d5b14b49b3..4cb8f393434d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4362,7 +4362,7 @@ static enum sctp_ierror sctp_sf_authenticate(
 	struct sctp_shared_key *sh_key = NULL;
 	struct sctp_authhdr *auth_hdr;
 	__u8 *save_digest, *digest;
-	struct sctp_hmac *hmac;
+	const struct sctp_hmac *hmac;
 	unsigned int sig_len;
 	__u16 key_id;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 4921416434f9..0292881a847c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9581,16 +9581,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 	if (err)
 		return err;
 
-	/* New ep's auth_hmacs should be set if old ep's is set, in case
-	 * that net->sctp.auth_enable has been changed to 0 by users and
-	 * new ep's auth_hmacs couldn't be set in sctp_endpoint_init().
-	 */
-	if (oldsp->ep->auth_hmacs) {
-		err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL);
-		if (err)
-			return err;
-	}
-
 	sctp_auto_asconf_init(newsp);
 
 	/* Move any messages in the old socket's receive queue that are for the
--
cgit v1.2.3


From 2f3dd6ec901f29aef5fff3d7a63b1371d67c1760 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Mon, 18 Aug 2025 13:54:25 -0700
Subject: sctp: Convert cookie authentication to use HMAC-SHA256

Convert SCTP cookies to use HMAC-SHA256, instead of the previous choice
of the legacy algorithms HMAC-MD5 and HMAC-SHA1. Simplify and optimize
the code by using the HMAC-SHA256 library instead of crypto_shash, and
by preparing the HMAC key when it is generated instead of per-operation.

This doesn't break compatibility, since the cookie format is an
implementation detail, not part of the SCTP protocol itself.

Note that the cookie size doesn't change either. The HMAC field was
already 32 bytes, even though previously at most 20 bytes were actually
compared. 32 bytes exactly fits an untruncated HMAC-SHA256 value. So,
although we could safely truncate the MAC to something slightly
shorter, for now just keep the cookie size the same.

I also considered SipHash, but that would generate only 8-byte MACs.
An 8-byte MAC *might* suffice here. However, there's quite a lot of
information in the SCTP cookies: more than in TCP SYN cookies. So
absent an analysis that occasional forgeries of all that information
are okay in SCTP, I erred on the side of caution.

Remove HMAC-MD5 and HMAC-SHA1 as options, since the new HMAC-SHA256
option is just better. It's faster as well as more secure. For example,
benchmarking on x86_64, cookie authentication is now nearly 3x as fast
as the previous default choice and implementation of HMAC-MD5.

Also just make the kernel always support cookie authentication if SCTP
is supported at all, rather than making it optional in the build.
(It was sort of optional before, but it didn't really work properly.
E.g., a kernel with CONFIG_SCTP_COOKIE_HMAC_MD5=n still supported HMAC-MD5 cookie authentication if CONFIG_CRYPTO_HMAC and CONFIG_CRYPTO_MD5 happened to be enabled in the kconfig for other reasons.) Acked-by: Xin Long Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-5-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 11 +++---- include/net/netns/sctp.h | 4 +-- include/net/sctp/constants.h | 5 ++- include/net/sctp/structs.h | 30 +++++------------- net/sctp/Kconfig | 43 ++++++------------------- net/sctp/endpointola.c | 23 +++++++------- net/sctp/protocol.c | 11 ++----- net/sctp/sm_make_chunk.c | 57 ++++++++++------------------------ net/sctp/socket.c | 31 +----------------- net/sctp/sysctl.c | 51 +++++++++++++----------------- 10 files changed, 79 insertions(+), 187 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9756d16e3df1..3d6782683eee 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -3508,16 +3508,13 @@ cookie_hmac_alg - STRING a listening sctp socket to a connecting client in the INIT-ACK chunk. Valid values are: - * md5 - * sha1 + * sha256 * none - Ability to assign md5 or sha1 as the selected alg is predicated on the - configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and - CONFIG_CRYPTO_SHA1). + md5 and sha1 are also accepted for backwards compatibility, but cause + sha256 to be selected. - Default: Dependent on configuration. MD5 if available, else SHA1 if - available, else none. + Default: sha256 rcvbuf_policy - INTEGER Determines if the receive buffer is attributed to the socket or to diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index d25cd7a9c5ff..c0f97f36389e 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -75,8 +75,8 @@ struct netns_sctp { /* Whether Cookie Preservative is enabled(1) or not(0) */ int cookie_preserve_enable; - /* The namespace default hmac alg */ - char *sctp_hmac_alg; + /* Whether cookie authentication is enabled(1) or not(0) */ + int cookie_auth_enable; /* Valid.Cookie.Life - 60 seconds */ unsigned int valid_cookie_life; diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 8e0f4c4f7750..ae3376ba0b99 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,9 +296,8 @@ enum { SCTP_MAX_GABS = 16 }; */ #define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */ -#define SCTP_SECRET_SIZE 32 /* Number of octets in a 256 bits. */ - -#define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */ +#define SCTP_COOKIE_KEY_SIZE 32 /* size of cookie HMAC key */ +#define SCTP_COOKIE_MAC_SIZE 32 /* size of HMAC field in cookies */ #define SCTP_COOKIE_MULTIPLE 32 /* Pad out our cookie to make our hash * functions simpler to write. diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 6be6aec25731..2ae390219efd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -32,6 +32,7 @@ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ +#include #include #include #include @@ -68,7 +69,6 @@ struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; -struct crypto_shash; struct sctp_stream; @@ -155,10 +155,6 @@ struct sctp_sock { /* PF_ family specific functions. */ struct sctp_pf *pf; - /* Access to HMAC transform. 
*/ - struct crypto_shash *hmac; - char *sctp_hmac_alg; - /* What is our base endpointer? */ struct sctp_endpoint *ep; @@ -227,7 +223,8 @@ struct sctp_sock { frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, - data_ready_signalled:1; + data_ready_signalled:1, + cookie_auth_enable:1; atomic_t pd_mode; @@ -335,7 +332,7 @@ struct sctp_cookie { /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { - __u8 signature[SCTP_SECRET_SIZE]; + __u8 mac[SCTP_COOKIE_MAC_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; @@ -1307,22 +1304,9 @@ struct sctp_endpoint { /* This is really a list of struct sctp_association entries. */ struct list_head asocs; - /* Secret Key: A secret key used by this endpoint to compute - * the MAC. This SHOULD be a cryptographic quality - * random number with a sufficient length. - * Discussion in [RFC1750] can be helpful in - * selection of the key. - */ - __u8 secret_key[SCTP_SECRET_SIZE]; - - /* digest: This is a digest of the sctp cookie. This field is - * only used on the receive path when we try to validate - * that the cookie has not been tampered with. We put - * this here so we pre-allocate this once and can re-use - * on every receive. - */ - __u8 *digest; - + /* Cookie authentication key used by this endpoint */ + struct hmac_sha256_key cookie_auth_key; + /* sendbuf acct. policy. */ __u32 sndbuf_policy; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 09c77b4d161b..e947646a380c 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -49,48 +49,25 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N + choice - prompt "Default SCTP cookie HMAC encoding" - default SCTP_DEFAULT_COOKIE_HMAC_MD5 + prompt "Default SCTP cookie authentication method" + default SCTP_DEFAULT_COOKIE_HMAC_SHA256 help - This option sets the default sctp cookie hmac algorithm - when in doubt select 'md5' + This option sets the default SCTP cookie authentication method, for + when a method hasn't been explicitly selected via the + net.sctp.cookie_hmac_alg sysctl. -config SCTP_DEFAULT_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_MD5 + If unsure, choose the default (HMAC-SHA256). -config SCTP_DEFAULT_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_SHA1 +config SCTP_DEFAULT_COOKIE_HMAC_SHA256 + bool "HMAC-SHA256" config SCTP_DEFAULT_COOKIE_HMAC_NONE - bool "Use no hmac alg in SCTP cookie generation" - help - Use no hmac algorithm in SCTP cookie generation + bool "None" endchoice -config SCTP_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_MD5 - -config SCTP_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - config INET_SCTP_DIAG depends on INET_DIAG def_tristate INET_DIAG diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 7e77b450697c..31e989dfe846 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -35,6 +35,15 @@ /* Forward declarations for internal helpers. 
*/ static void sctp_endpoint_bh_rcv(struct work_struct *work); +static void gen_cookie_auth_key(struct hmac_sha256_key *key) +{ + u8 raw_key[SCTP_COOKIE_KEY_SIZE]; + + get_random_bytes(raw_key, sizeof(raw_key)); + hmac_sha256_preparekey(key, raw_key, sizeof(raw_key)); + memzero_explicit(raw_key, sizeof(raw_key)); +} + /* * Initialize the base fields of the endpoint structure. */ @@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct net *net = sock_net(sk); struct sctp_shared_key *null_key; - ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); - if (!ep->digest) - return NULL; - ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { @@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; - /* Initialize the secret key used with cookie. */ - get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); + /* Generate the cookie authentication key. */ + gen_cookie_auth_key(&ep->cookie_auth_key); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); @@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, nomem_shkey: sctp_auth_free(ep); nomem: - kfree(ep->digest); return NULL; } @@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) return; } - /* Free the digest buffer */ - kfree(ep->digest); - /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ @@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); - memset(ep->secret_key, 0, sizeof(ep->secret_key)); + memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key)); sk = ep->base.sk; /* Remove and free the port */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5ccada55f2b..3b2373b3bd5d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1334,14 +1334,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; - /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) - net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) - net->sctp.sctp_hmac_alg = "sha1"; -#else - net->sctp.sctp_hmac_alg = NULL; -#endif + /* Whether cookie authentication is enabled(1) or not(0) */ + net->sctp.cookie_auth_enable = + !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE); /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index a1a3c8494c5d..2c0017d058d4 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -30,7 +30,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -1675,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie( * out on the network. 
*/ retval = kzalloc(*cookie_len, GFP_ATOMIC); - if (!retval) - goto nodata; + if (!retval) { + *cookie_len = 0; + return NULL; + } cookie = (struct sctp_signed_cookie *) retval->body; @@ -1707,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie( memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); - if (sctp_sk(ep->base.sk)->hmac) { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - /* Sign the message. */ - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, - cookie->signature); - if (err) - goto free_cookie; + /* Sign the cookie, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE); + hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, + bodysize, cookie->mac); } return retval; - -free_cookie: - kfree(retval); -nodata: - *cookie_len = 0; - return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ @@ -1741,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; - __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; @@ -1771,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie( cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; - if (!sctp_sk(ep->base.sk)->hmac) - goto no_hmac; + /* Verify the cookie's MAC, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + u8 mac[SHA256_DIGEST_SIZE]; - /* Check the signature. */ - { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, - digest); - if (err) { - *error = -SCTP_IERROR_NOMEM; + hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie, + bodysize, mac); + static_assert(sizeof(cookie->mac) == sizeof(mac)); + if (crypto_memneq(mac, cookie->mac, sizeof(mac))) { + *error = -SCTP_IERROR_BAD_SIG; goto fail; } } - if (crypto_memneq(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - *error = -SCTP_IERROR_BAD_SIG; - goto fail; - } - -no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0292881a847c..ed8293a34240 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -37,7 +37,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include @@ -4987,7 +4986,7 @@ static int sctp_init_sock(struct sock *sk) sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; - sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; + sp->cookie_auth_enable = net->sctp.cookie_auth_enable; /* Initialize default setup parameters. 
These parameters * can be modified with the SCTP_INITMSG socket option or @@ -5079,8 +5078,6 @@ static int sctp_init_sock(struct sock *sk) if (!sp->ep) return -ENOMEM; - sp->hmac = NULL; - sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); @@ -5117,18 +5114,8 @@ static void sctp_destroy_sock(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } -/* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_common(struct sock *sk) -{ - struct sctp_sock *sp = sctp_sk(sk); - - /* Free up the HMAC transform. */ - crypto_free_shash(sp->hmac); -} - static void sctp_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -8530,22 +8517,8 @@ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; - struct crypto_shash *tfm = NULL; - char alg[32]; int err; - /* Allocate HMAC for generating cookie. */ - if (!sp->hmac && sp->sctp_hmac_alg) { - sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); - tfm = crypto_alloc_shash(alg, 0, 0); - if (IS_ERR(tfm)) { - net_info_ratelimited("failed to load transform for %s: %ld\n", - sp->sctp_hmac_alg, PTR_ERR(tfm)); - return -ENOSYS; - } - sctp_sk(sk)->hmac = tfm; - } - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -9561,7 +9534,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * copy. */ newsp->ep = newep; - newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), @@ -9713,7 +9685,6 @@ struct proto sctp_prot = { static void sctp_v6_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet6_sock_destruct(sk); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ee3eac338a9d..19acc57c3ed9 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", - .data = &init_net.sctp.sctp_hmac_alg, + .data = &init_net.sctp.cookie_auth_enable, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, @@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, - sctp.sctp_hmac_alg); + sctp.cookie_auth_enable); struct ctl_table tbl; - bool changed = false; - char *none = "none"; char tmp[8] = {0}; int ret; @@ -399,35 +397,28 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, if (write) { tbl.data = tmp; - tbl.maxlen = sizeof(tmp); - } else { - tbl.data = net->sctp.sctp_hmac_alg ? 
: none; - tbl.maxlen = strlen(tbl.data); - } - - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write && ret == 0) { -#ifdef CONFIG_CRYPTO_MD5 - if (!strncmp(tmp, "md5", 3)) { - net->sctp.sctp_hmac_alg = "md5"; - changed = true; + tbl.maxlen = sizeof(tmp) - 1; + ret = proc_dostring(&tbl, 1, buffer, lenp, ppos); + if (ret) + return ret; + if (!strcmp(tmp, "sha256") || + /* for backwards compatibility */ + !strcmp(tmp, "md5") || !strcmp(tmp, "sha1")) { + net->sctp.cookie_auth_enable = 1; + return 0; } -#endif -#ifdef CONFIG_CRYPTO_SHA1 - if (!strncmp(tmp, "sha1", 4)) { - net->sctp.sctp_hmac_alg = "sha1"; - changed = true; + if (!strcmp(tmp, "none")) { + net->sctp.cookie_auth_enable = 0; + return 0; } -#endif - if (!strncmp(tmp, "none", 4)) { - net->sctp.sctp_hmac_alg = NULL; - changed = true; - } - if (!changed) - ret = -EINVAL; + return -EINVAL; } - - return ret; + if (net->sctp.cookie_auth_enable) + tbl.data = (char *)"sha256"; + else + tbl.data = (char *)"none"; + tbl.maxlen = strlen(tbl.data); + return proc_dostring(&tbl, 0, buffer, lenp, ppos); } static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, -- cgit v1.2.3 From d5a253702add0da3e1e19252ae2a251ee24b486d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:54:26 -0700 Subject: sctp: Stop accepting md5 and sha1 for net.sctp.cookie_hmac_alg The upgrade of the cookie authentication algorithm to HMAC-SHA256 kept some backwards compatibility for the net.sctp.cookie_hmac_alg sysctl by still accepting the values 'md5' and 'sha1'. Those algorithms are no longer actually used, but rather those values were just treated as requests to enable cookie authentication. As requested at https://lore.kernel.org/netdev/CADvbK_fmCRARc8VznH8cQa-QKaCOQZ6yFbF=1-VDK=zRqv_cXw@mail.gmail.com/ and https://lore.kernel.org/netdev/20250818084345.708ac796@kernel.org/ , go further and start rejecting 'md5' and 'sha1' completely. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250818205426.30222-6-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 3 --- net/sctp/sysctl.c | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3d6782683eee..43badb338d22 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -3511,9 +3511,6 @@ cookie_hmac_alg - STRING * sha256 * none - md5 and sha1 are also accepted for backwards compatibility, but cause - sha256 to be selected. - Default: sha256 rcvbuf_policy - INTEGER diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 19acc57c3ed9..15e7db9a3ab2 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -401,9 +401,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, ret = proc_dostring(&tbl, 1, buffer, lenp, ppos); if (ret) return ret; - if (!strcmp(tmp, "sha256") || - /* for backwards compatibility */ - !strcmp(tmp, "md5") || !strcmp(tmp, "sha1")) { + if (!strcmp(tmp, "sha256")) { net->sctp.cookie_auth_enable = 1; return 0; } -- cgit v1.2.3 From 08d07f25fd5e2ca3d32444f8c41e9e6e59e8a54e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 15 Aug 2025 14:55:49 +0200 Subject: netfilter: ctnetlink: remove refcounting in dying list dumping There is no need to keep the object alive via refcount, use a cookie and then use that as the skip hint for dump resumption. 
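A minimal sketch of the resumption pattern in isolation (illustrative only; the entry type, list and helpers here are hypothetical stand-ins for the conntrack code):

	struct dump_ctx {
		unsigned long last_id;	/* 0 == start from the beginning */
	};

	static int dump_entries(struct dump_ctx *ctx, struct sk_buff *skb)
	{
		struct entry *e;

		list_for_each_entry(e, &entry_list, list) {
			/* Skip forward until the remembered entry. */
			if (ctx->last_id && entry_id(e) != ctx->last_id)
				continue;
			ctx->last_id = 0;
			if (emit_entry(skb, e) < 0) {
				/* skb full: remember where to resume. */
				ctx->last_id = entry_id(e);
				return skb->len;
			}
		}
		return 0;
	}

No reference is held between dump calls: if the remembered entry is freed in the meantime, the id never matches again and the dump simply ends early, which is the price of not pinning the object.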
Unlike the two earlier, similar patches in this file, this is a cleanup without intended side effects. Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_netlink.c | 39 +++++++++--------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 50fd6809380f..3a04665adf99 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -60,7 +60,7 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { - struct nf_conn *last; + unsigned long last_id; unsigned int cpu; bool done; }; @@ -1733,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } -static int ctnetlink_done_list(struct netlink_callback *cb) -{ - struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - - if (ctx->last) - nf_ct_put(ctx->last); - - return 0; -} - #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_one_entry(struct sk_buff *skb, struct netlink_callback *cb, @@ -1757,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, if (l3proto && nf_ct_l3num(ct) != l3proto) return 0; - if (ctx->last) { - if (ct != ctx->last) + if (ctx->last_id) { + if (ctnetlink_get_id(ct) != ctx->last_id) return 0; - ctx->last = NULL; + ctx->last_id = 0; } /* We can't dump extension info for the unconfirmed @@ -1775,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); - if (res < 0) { - if (!refcount_inc_not_zero(&ct->ct_general.use)) - return 0; - - ctx->last = ct; - } + if (res < 0) + ctx->last_id = ctnetlink_get_id(ct); return res; } @@ -1796,10 +1782,10 @@ static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - struct nf_conn *last = ctx->last; #ifdef CONFIG_NF_CONNTRACK_EVENTS const struct net *net = sock_net(skb->sk); struct nf_conntrack_net_ecache *ecache_net; + unsigned long last_id = ctx->last_id; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; #endif @@ -1807,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) if (ctx->done) return 0; - ctx->last = NULL; + ctx->last_id = 0; #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache_net = nf_conn_pernet_ecache(net); @@ -1818,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) int res; ct = nf_ct_tuplehash_to_ctrack(h); - if (last && last != ct) + if (last_id && last_id != ctnetlink_get_id(ct)) continue; res = ctnetlink_dump_one_entry(skb, cb, ct, true); if (res < 0) { spin_unlock_bh(&ecache_net->dying_lock); - nf_ct_put(last); return skb->len; } - nf_ct_put(last); - last = NULL; + last_id = 0; } spin_unlock_bh(&ecache_net->dying_lock); #endif ctx->done = true; - nf_ct_put(last); return skb->len; } @@ -1847,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, - .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -1862,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, - .done = ctnetlink_done_list, }; return 
netlink_dump_start(info->sk, skb, info->nlh, &c); } -- cgit v1.2.3 From d11b26402a33f5c45389e0a288430c457434c9cd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Aug 2025 18:09:36 +0200 Subject: netfilter: nft_set_pipapo_avx2: Drop the comment regarding protection The comment claims that the kernel_fpu_begin_mask() below protects access to the scratch map. This is not true because the access is only protected by local_bh_disable() above. Remove the misleading comment. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo_avx2.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 2f090e253caf..fc734a8545b4 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1171,9 +1171,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, m = rcu_dereference(priv->match); - /* This also protects access to all data related to scratch maps. - * - * Note that we don't need a valid MXCSR state for any of the + /* Note that we don't need a valid MXCSR state for any of the * operations we use here, so pass 0 as mask and spare a LDMXCSR * instruction. */ -- cgit v1.2.3 From 416e53e39516714057d7d06d561e49d1a89fa524 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 15 Aug 2025 16:36:57 +0200 Subject: netfilter: nft_set_pipapo_avx2: split lookup function in two parts Split the main avx2 lookup function into a helper. This is a preparation patch: followup change will use the new helper from the insertion path if possible. This greatly improves insertion performance when avx2 is supported. Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo_avx2.c | 126 ++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index fc734a8545b4..994a2ad2d9b1 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1133,56 +1133,35 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns } /** - * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation - * @net: Network namespace - * @set: nftables API set representation - * @key: nftables API element representation containing key data + * pipapo_get_avx2() - Lookup function for AVX2 implementation + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * - * Return: true on match, false otherwise. + * The caller must check that the FPU is usable. + * This function must be called with BH disabled. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
*/ -const struct nft_set_ext * -nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); - const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; - const u8 *rp = (const u8 *)key; unsigned long *res, *fill; bool map_index; int i; - local_bh_disable(); - - if (unlikely(!irq_fpu_usable())) { - ext = nft_pipapo_lookup(net, set, key); - - local_bh_enable(); - return ext; - } - - m = rcu_dereference(priv->match); - - /* Note that we don't need a valid MXCSR state for any of the - * operations we use here, so pass 0 as mask and spare a LDMXCSR - * instruction. - */ - kernel_fpu_begin_mask(0); - scratch = *raw_cpu_ptr(m->scratch); - if (unlikely(!scratch)) { - kernel_fpu_end(); - local_bh_enable(); + if (unlikely(!scratch)) return NULL; - } map_index = scratch->map_index; @@ -1191,6 +1170,12 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, pipapo_resmap_init_avx2(m, res); + /* Note that we don't need a valid MXCSR state for any of the + * operations we use here, so pass 0 as mask and spare a LDMXCSR + * instruction. + */ + kernel_fpu_begin_mask(0); + nft_pipapo_avx2_prepare(); next_match: @@ -1200,7 +1185,7 @@ next_match: #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ - ret, rp, \ + ret, data, \ first, last)) if (likely(f->bb == 8)) { @@ -1216,7 +1201,7 @@ next_match: NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, - ret, rp, + ret, data, first, last); } } else { @@ -1232,7 +1217,7 @@ next_match: NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, - ret, rp, + ret, data, first, last); } } @@ -1240,29 +1225,72 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP - if (ret < 0) - goto out; + if (ret < 0) { + scratch->map_index = map_index; + kernel_fpu_end(); + return NULL; + } if (last) { - const struct nft_set_ext *e = &f->mt[ret].e->ext; + struct nft_pipapo_elem *e; - if (unlikely(nft_set_elem_expired(e) || - !nft_set_elem_active(e, genmask))) + e = f->mt[ret].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) goto next_match; - ext = e; - goto out; + scratch->map_index = map_index; + kernel_fpu_end(); + return e; } + map_index = !map_index; swap(res, fill); - rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } -out: - if (i % 2) - scratch->map_index = !map_index; kernel_fpu_end(); + return NULL; +} + +/** + * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation + * @net: Network namespace + * @set: nftables API set representation + * @key: nftables API element representation containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy using + * the AVX2 routines if the fpu is usable or fall back to the generic + * implementation of the algorithm otherwise. + * + * Return: nftables API extension pointer or NULL if no match. 
+ */ +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + u8 genmask = nft_genmask_cur(net); + const struct nft_pipapo_match *m; + const u8 *rp = (const u8 *)key; + const struct nft_pipapo_elem *e; + + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + const struct nft_set_ext *ext; + + ext = nft_pipapo_lookup(net, set, key); + + local_bh_enable(); + return ext; + } + + m = rcu_dereference(priv->match); + + e = pipapo_get_avx2(m, rp, genmask, get_jiffies_64()); local_bh_enable(); - return ext; + return e ? &e->ext : NULL; } -- cgit v1.2.3 From 84c1da7b38d9ad8fadd5b0b76034a41f7761e404 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 15 Aug 2025 16:36:58 +0200 Subject: netfilter: nft_set_pipapo: use avx2 algorithm for insertions too Always prefer the avx2 implementation if its available. This greatly improves insertion performance (each insertion checks if the new element would overlap with an existing one): time nft -f - < Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 45 ++++++++++++++++++++++++++++++++----- net/netfilter/nft_set_pipapo_avx2.c | 8 +++---- net/netfilter/nft_set_pipapo_avx2.h | 4 ++++ 3 files changed, 48 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9a10251228fd..7ed9c5f0e233 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, } /** - * pipapo_get() - Get matching element reference given key data + * pipapo_get_slow() - Get matching element reference given key data * @m: storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask @@ -414,9 +414,9 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, * * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, - const u8 *data, u8 genmask, - u64 tstamp) +static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { struct nft_pipapo_scratch *scratch; unsigned long *res_map, *fill_map; @@ -502,6 +502,41 @@ out: return NULL; } +/** + * pipapo_get() - Get matching element reference given key data + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements + * + * This is a dispatcher function, either calling out the generic C + * implementation or, if available, the AVX2 one. + * This helper is only called from the control plane, with either RCU + * read lock or transaction mutex held. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
+ */ +static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) +{ + struct nft_pipapo_elem *e; + + local_bh_disable(); + +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && + irq_fpu_usable()) { + e = pipapo_get_avx2(m, data, genmask, tstamp); + local_bh_enable(); + return e; + } +#endif + e = pipapo_get_slow(m, data, genmask, tstamp); + local_bh_enable(); + return e; +} + /** * nft_pipapo_lookup() - Dataplane frontend for main lookup function * @net: Network namespace @@ -523,7 +558,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get_slow(m, (const u8 *)key, genmask, get_jiffies_64()); return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 994a2ad2d9b1..028c11724b42 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1149,9 +1149,9 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns * * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -static struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, - const u8 *data, u8 genmask, - u64 tstamp) +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { struct nft_pipapo_scratch *scratch; const struct nft_pipapo_field *f; @@ -1261,7 +1261,7 @@ next_match: * * This function is called from the data path. It will search for * an element matching the given key in the current active copy using - * the AVX2 routines if the fpu is usable or fall back to the generic + * the AVX2 routines if the FPU is usable or fall back to the generic * implementation of the algorithm otherwise. * * Return: nftables API extension pointer or NULL if no match. diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index dbb6aaca8a7a..c2999b63da3f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,12 @@ #include #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) +struct nft_pipapo_match; bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ #endif /* _NFT_SET_PIPAPO_AVX2_H */ -- cgit v1.2.3 From 6aa67d5706f031f24cd486d8df7dc7fddca62b22 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 15 Aug 2025 18:09:35 +0200 Subject: netfilter: nft_set_pipapo: Store real pointer, adjust later. The struct nft_pipapo_scratch is allocated, then aligned to the required alignment, and the difference (in bytes) is then saved in align_off. The aligned pointer is used later. While this works, it gets complicated with all the extra checks if all members before map are larger than the required alignment. Instead of saving the aligned pointer, just save the returned pointer and align the map pointer in nft_pipapo_lookup() before using it. The alignment later on shouldn't be that expensive. With this change, the align_off can be removed and the pointer can be passed to kfree() as is.
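The resulting shape, reduced to its essentials (a sketch; ALIGN_HEADROOM and ALIGN_BYTES stand in for the NFT_PIPAPO_* constants, and error handling is elided):

	struct scratch {
		u8 map_index;
		unsigned long __map[];		/* alignment not guaranteed */
	};

	/* Allocate with worst-case slack; store the raw pointer. */
	s = kvzalloc_node(struct_size(s, __map, n * 2) + ALIGN_HEADROOM,
			  GFP_KERNEL, node);

	/* Align at the point of use instead of at allocation time. */
	map = PTR_ALIGN(&s->__map[0], ALIGN_BYTES);

	/* Free the pointer exactly as returned by the allocator. */
	kvfree(s);

One PTR_ALIGN-style computation per lookup replaces the align_off bookkeeping at both allocation and free time.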
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 40 ++++++++----------------------------- net/netfilter/nft_set_pipapo.h | 6 ++---- net/netfilter/nft_set_pipapo_avx2.c | 8 ++++---- 3 files changed, 14 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 7ed9c5f0e233..96b7539f5506 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -418,8 +418,8 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, const u8 *data, u8 genmask, u64 tstamp) { + unsigned long *res_map, *fill_map, *map; struct nft_pipapo_scratch *scratch; - unsigned long *res_map, *fill_map; const struct nft_pipapo_field *f; bool map_index; int i; @@ -432,8 +432,9 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, map_index = scratch->map_index; - res_map = scratch->map + (map_index ? m->bsize_max : 0); - fill_map = scratch->map + (map_index ? 0 : m->bsize_max); + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res_map = map + (map_index ? m->bsize_max : 0); + fill_map = map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init(m, res_map); @@ -1171,22 +1172,17 @@ static void pipapo_map(struct nft_pipapo_match *m, } /** - * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address + * pipapo_free_scratch() - Free per-CPU map at original address * @m: Matching data * @cpu: CPU number */ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) { struct nft_pipapo_scratch *s; - void *mem; s = *per_cpu_ptr(m->scratch, cpu); - if (!s) - return; - mem = s; - mem -= s->align_off; - kvfree(mem); + kvfree(s); } /** @@ -1203,11 +1199,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, for_each_possible_cpu(i) { struct nft_pipapo_scratch *scratch; -#ifdef NFT_PIPAPO_ALIGN - void *scratch_aligned; - u32 align_off; -#endif - scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) + + + scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { @@ -1222,23 +1215,6 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, } pipapo_free_scratch(clone, i); - -#ifdef NFT_PIPAPO_ALIGN - /* Align &scratch->map (not the struct itself): the extra - * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() - * above guarantee we can waste up to those bytes in order - * to align the map field regardless of its offset within - * the struct. 
- */ - BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); - - scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); - scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); - align_off = scratch_aligned - (void *)scratch; - - scratch = scratch_aligned; - scratch->align_off = align_off; -#endif *per_cpu_ptr(clone->scratch, i) = scratch; } diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 4a2ff85ce1c4..e10cdbaa65d8 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -125,13 +125,11 @@ struct nft_pipapo_field { /** * struct nft_pipapo_scratch - percpu data used for lookup and matching * @map_index: Current working bitmap index, toggled between field matches - * @align_off: Offset to get the originally allocated address - * @map: store partial matching results during lookup + * @__map: store partial matching results during lookup */ struct nft_pipapo_scratch { u8 map_index; - u32 align_off; - unsigned long map[]; + unsigned long __map[]; }; /** diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 028c11724b42..f0d8c796d731 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1155,7 +1155,7 @@ struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, { struct nft_pipapo_scratch *scratch; const struct nft_pipapo_field *f; - unsigned long *res, *fill; + unsigned long *res, *fill, *map; bool map_index; int i; @@ -1164,9 +1164,9 @@ struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, return NULL; map_index = scratch->map_index; - - res = scratch->map + (map_index ? m->bsize_max : 0); - fill = scratch->map + (map_index ? 0 : m->bsize_max); + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res = map + (map_index ? m->bsize_max : 0); + fill = map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init_avx2(m, res); -- cgit v1.2.3 From 456010c8b99e65231160d4c706122ac5502fbcff Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 18 Aug 2025 13:02:13 +0200 Subject: netfilter: nft_set_pipapo: Use nested-BH locking for nft_pipapo_scratch nft_pipapo_scratch is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 5 +++++ net/netfilter/nft_set_pipapo.h | 2 ++ net/netfilter/nft_set_pipapo_avx2.c | 4 ++++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 96b7539f5506..b385cfcf886f 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -429,6 +429,7 @@ static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) goto out; + __local_lock_nested_bh(&scratch->bh_lock); map_index = scratch->map_index; @@ -465,6 +466,7 @@ next_match: last); if (b < 0) { scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return NULL; @@ -484,6 +486,7 @@ next_match: * *next* bitmap (not initial) for the next packet. 
*/ scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return e; } @@ -498,6 +501,7 @@ next_match: data += NFT_PIPAPO_GROUPS_PADDING(f); } + __local_unlock_nested_bh(&scratch->bh_lock); out: local_bh_enable(); return NULL; @@ -1215,6 +1219,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, } pipapo_free_scratch(clone, i); + local_lock_init(&scratch->bh_lock); *per_cpu_ptr(clone->scratch, i) = scratch; } diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index e10cdbaa65d8..eaab422aa56a 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -124,10 +124,12 @@ struct nft_pipapo_field { /** * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @bh_lock: PREEMPT_RT local spinlock * @map_index: Current working bitmap index, toggled between field matches * @__map: store partial matching results during lookup */ struct nft_pipapo_scratch { + local_lock_t bh_lock; u8 map_index; unsigned long __map[]; }; diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index f0d8c796d731..29326f3fcaf3 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1163,6 +1163,7 @@ struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, if (unlikely(!scratch)) return NULL; + __local_lock_nested_bh(&scratch->bh_lock); map_index = scratch->map_index; map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); res = map + (map_index ? m->bsize_max : 0); @@ -1228,6 +1229,7 @@ next_match: if (ret < 0) { scratch->map_index = map_index; kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); return NULL; } @@ -1241,6 +1243,7 @@ next_match: scratch->map_index = map_index; kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); return e; } @@ -1250,6 +1253,7 @@ next_match: } kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); return NULL; } -- cgit v1.2.3 From 8f2c72f2252cf228879de0224d5055470fc20c06 Mon Sep 17 00:00:00 2001 From: Pengtao He Date: Tue, 19 Aug 2025 10:15:51 +0800 Subject: net: avoid one loop iteration in __skb_splice_bits If *len is equal to 0 at the beginning of __splice_segment it returns true directly. But when decreasing *len from a positive number to 0 in __splice_segment, it returns false. The __skb_splice_bits needs to call __splice_segment again. Recheck *len if it changes, return true in time. Reduce unnecessary calls to __splice_segment. 
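The control-flow change in isolation (a sketch of the loop tail only):

	do {
		/* ...splice up to flen bytes from this segment... */
		poff += flen;
		plen -= flen;
		*len -= flen;
		if (!*len)		/* request satisfied: stop the caller */
			return true;
	} while (plen);
	return false;			/* segment exhausted, keep iterating */

Returning true the moment *len reaches zero saves __skb_splice_bits one extra call into __splice_segment whose only purpose would be to notice that nothing is left to do.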
Signed-off-by: Pengtao He Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250819021551.8361-1-hept.hept.hept@gmail.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ee0274417948..23b776cd9879 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3112,7 +3112,9 @@ static bool __splice_segment(struct page *page, unsigned int poff, poff += flen; plen -= flen; *len -= flen; - } while (*len && plen); + if (!*len) + return true; + } while (plen); return false; } -- cgit v1.2.3 From 15de71d06a400f7fdc15bf377a2552b0ec437cf5 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 19 Aug 2025 03:36:28 +0000 Subject: net/sched: Make cake_enqueue return NET_XMIT_CN when past buffer_limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following setup can trigger a WARNING in htb_activate due to the condition: !cl->leaf.q->q.qlen tc qdisc del dev lo root tc qdisc add dev lo root handle 1: htb default 1 tc class add dev lo parent 1: classid 1:1 \ htb rate 64bit tc qdisc add dev lo parent 1:1 handle f: \ cake memlimit 1b ping -I lo -f -c1 -s64 -W0.001 127.0.0.1 This is because the low memlimit leads to a low buffer_limit, which causes packet dropping. However, cake_enqueue still returns NET_XMIT_SUCCESS, causing htb_enqueue to call htb_activate with an empty child qdisc. We should return NET_XMIT_CN when packets are dropped from the same tin and flow. I do not believe return value of NET_XMIT_CN is necessary for packet drops in the case of ack filtering, as that is meant to optimize performance, not to signal congestion. Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Acked-by: Toke Høiland-Jørgensen Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250819033601.579821-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index dbcfb948c867..32bacfc314c2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1750,7 +1750,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx; + u32 idx, tin; /* choose flow to insert into */ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); @@ -1760,6 +1760,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; } + tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; @@ -1927,13 +1928,22 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->buffer_max_used = q->buffer_used; if (q->buffer_used > q->buffer_limit) { + bool same_flow = false; u32 dropped = 0; + u32 drop_id; while (q->buffer_used > q->buffer_limit) { dropped++; - cake_drop(sch, to_free); + drop_id = cake_drop(sch, to_free); + + if ((drop_id >> 16) == tin && + (drop_id & 0xFFFF) == idx) + same_flow = true; } b->drop_overlimit += dropped; + + if (same_flow) + return NET_XMIT_CN; } return NET_XMIT_SUCCESS; } -- cgit v1.2.3 From 2c2192e5f9c7c2892fe2363244d1387f62710d83 Mon Sep 17 00:00:00 2001 From: William Liu Date: Tue, 19 Aug 2025 03:36:59 +0000 Subject: net/sched: Remove unnecessary WARNING condition for empty child qdisc in htb_activate The WARN_ON trigger based 
on !cl->leaf.q->q.qlen is unnecessary in htb_activate. htb_dequeue_tree already accounts for that scenario. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: William Liu Reviewed-by: Savino Dicanosa Link: https://patch.msgid.link/20250819033632.579854-1-will@willsroot.io Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c968ea763774..b5e40c51655a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -592,7 +592,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; -- cgit v1.2.3 From 7af76e9d18a9fd6f8611b3313c86c190f9b6a5a7 Mon Sep 17 00:00:00 2001 From: Jakub Acs Date: Tue, 19 Aug 2025 08:28:42 +0000 Subject: net, hsr: reject HSR frame if skb can't hold tag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Receiving HSR frame with insufficient space to hold HSR tag in the skb can result in a crash (kernel BUG): [ 45.390915] skbuff: skb_under_panic: text:ffffffff86f32cac len:26 put:14 head:ffff888042418000 data:ffff888042417ff4 tail:0xe end:0x180 dev:bridge_slave_1 [ 45.392559] ------------[ cut here ]------------ [ 45.392912] kernel BUG at net/core/skbuff.c:211! [ 45.393276] Oops: invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI [ 45.393809] CPU: 1 UID: 0 PID: 2496 Comm: reproducer Not tainted 6.15.0 #12 PREEMPT(undef) [ 45.394433] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 45.395273] RIP: 0010:skb_panic+0x15b/0x1d0 [ 45.402911] Call Trace: [ 45.403105] [ 45.404470] skb_push+0xcd/0xf0 [ 45.404726] br_dev_queue_push_xmit+0x7c/0x6c0 [ 45.406513] br_forward_finish+0x128/0x260 [ 45.408483] __br_forward+0x42d/0x590 [ 45.409464] maybe_deliver+0x2eb/0x420 [ 45.409763] br_flood+0x174/0x4a0 [ 45.410030] br_handle_frame_finish+0xc7c/0x1bc0 [ 45.411618] br_handle_frame+0xac3/0x1230 [ 45.413674] __netif_receive_skb_core.constprop.0+0x808/0x3df0 [ 45.422966] __netif_receive_skb_one_core+0xb4/0x1f0 [ 45.424478] __netif_receive_skb+0x22/0x170 [ 45.424806] process_backlog+0x242/0x6d0 [ 45.425116] __napi_poll+0xbb/0x630 [ 45.425394] net_rx_action+0x4d1/0xcc0 [ 45.427613] handle_softirqs+0x1a4/0x580 [ 45.427926] do_softirq+0x74/0x90 [ 45.428196] This issue was found by syzkaller. The panic happens in br_dev_queue_push_xmit() once it receives a corrupted skb with ETH header already pushed in linear data. When it attempts the skb_push() call, there's not enough headroom and skb_push() panics. The corrupted skb is put on the queue by HSR layer, which makes a sequence of unintended transformations when it receives a specific corrupted HSR frame (with incomplete TAG). Fix it by dropping and consuming frames that are not long enough to contain both ethernet and hsr headers. Alternative fix would be to check for enough headroom before skb_push() in br_dev_queue_push_xmit(). In the reproducer, this is injected via AF_PACKET, but I don't easily see why it couldn't be sent over the wire from adjacent network. 
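For reference, skb_push() enforces the headroom invariant roughly like this (a sketch of the core of the helper, modulo upstream details):

	void *skb_push(struct sk_buff *skb, unsigned int len)
	{
		skb->data -= len;
		skb->len  += len;
		if (unlikely(skb->data < skb->head))
			skb_under_panic(skb, len, __builtin_return_address(0));
		return skb->data;
	}

Any caller that pushes more bytes than the headroom holds, as the bridge does here with the corrupted skb, hits the skb_under_panic() BUG above.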
Further Details: In the reproducer, the following network interface chain is set up: ┌────────────────┐ ┌────────────────┐ │ veth0_to_hsr ├───┤ hsr_slave0 ┼───┐ └────────────────┘ └────────────────┘ │ │ ┌──────┐ ├─┤ hsr0 ├───┐ │ └──────┘ │ ┌────────────────┐ ┌────────────────┐ │ │┌────────┐ │ veth1_to_hsr ┼───┤ hsr_slave1 ├───┘ └┤ │ └────────────────┘ └────────────────┘ ┌┼ bridge │ ││ │ │└────────┘ │ ┌───────┐ │ │ ... ├──────┘ └───────┘ To trigger the events leading up to the crash, the reproducer sends a corrupted HSR frame with an incomplete TAG, via an AF_PACKET socket on 'veth0_to_hsr'. The first HSR-layer function to process this frame is hsr_handle_frame(). It then checks whether the protocol is ETH_P_PRP or ETH_P_HSR. If it is, it calls skb_set_network_header(skb, ETH_HLEN + HSR_HLEN), without checking that the skb is long enough. For the crashing frame it is not, and hence the skb->network_header and skb->mac_len fields are set incorrectly, pointing after the end of the linear buffer. I will call this BUG#1; it is what is addressed by this patch. In the crashing scenario before the fix, the skb continues to go down the hsr path as follows. hsr_handle_frame() then calls this sequence: hsr_forward_skb() fill_frame_info() hsr->proto_ops->fill_frame_info() hsr_fill_frame_info() hsr_fill_frame_info() contains a check intended to verify whether the skb actually contains the HSR header. But the check relies on the skb->mac_len field which was erroneously set up due to BUG#1, so the check passes and the execution continues back in the hsr_forward_skb(): hsr_forward_skb() hsr_forward_do() hsr->proto_ops->get_untagged_frame() hsr_get_untagged_frame() create_stripped_skb_hsr() In create_stripped_skb_hsr(), a copy of the skb is created and is further corrupted by an operation that attempts to strip the HSR tag in a call to __pskb_copy(). The skb enters create_stripped_skb_hsr() with the ethernet header pushed in the linear buffer. The skb_pull(skb_in, HSR_HLEN) thus pulls 6 bytes of the ethernet header into the headroom, creating skb_in with a headroom of size 8. The subsequent __pskb_copy() then creates an skb with a headroom of just 2 and skb->len of just 12; this is how it looks after the copy: (gdb) p skb->len $10 = 12 (gdb) p skb->data $11 = (unsigned char *) 0xffff888041e45382 "\252\252\252\252\252!\210\373", (gdb) p skb->head $12 = (unsigned char *) 0xffff888041e45380 "" It seems create_stripped_skb_hsr() assumes that the ETH header is pulled into the headroom when it's entered, because it just pulls the HSR header on top. But that is not the case in our code-path and we end up with the corrupted skb instead. I will call this BUG#2. *I got confused here because it seems that under no conditions can create_stripped_skb_hsr() work well: the assumption it makes is not true during the processing of hsr frames - from the skb_push() in hsr_handle_frame() to the skb_pull() in hsr_deliver_master(). I wonder whether I missed something here.* Next, the execution arrives in hsr_deliver_master(). It calls skb_pull(ETH_HLEN), which just returns NULL - the SKB does not have enough space for the pull (as it only has 12 bytes in total at this point). *The skb_pull() here further suggests that the ethernet header is meant to be pushed through the whole hsr processing and create_stripped_skb_hsr() should pull it before doing the HSR header pull.* hsr_deliver_master() then puts the corrupted skb on the queue; it is then picked up from there by the bridge frame-handling layer and finally lands in br_dev_queue_push_xmit() where it panics.
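The rule the fix applies, in isolation (a sketch; the surrounding rx-handler plumbing is elided): prove the bytes exist before deriving offsets from them.

	if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) {
		kfree_skb(skb);			/* too short to carry a tag */
		return RX_HANDLER_CONSUMED;	/* drop, don't propagate */
	}
	skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);

pskb_may_pull() succeeds only if the first ETH_HLEN + HSR_HLEN bytes are (or can be made) available in the linear area, which is exactly the precondition skb_set_network_header() silently assumes.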
Cc: stable@kernel.org Fixes: 48b491a5cc74 ("net: hsr: fix mac_len checks") Reported-by: syzbot+a81f2759d022496b40ab@syzkaller.appspotmail.com Signed-off-by: Jakub Acs Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250819082842.94378-1-acsjakub@amazon.de Signed-off-by: Jakub Kicinski --- net/hsr/hsr_slave.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index b87b6a6fe070..102eccf5ead7 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -63,8 +63,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || - protocol == htons(ETH_P_HSR)) + protocol == htons(ETH_P_HSR)) { + if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) { + kfree_skb(skb); + goto finish_consume; + } + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + } skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a -- cgit v1.2.3 From a458b2902115b26a25d67393b12ddd57d1216aaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Aug 2025 13:27:24 -0700 Subject: ipv6: sr: Fix MAC comparison to be constant-time To prevent timing attacks, MACs need to be compared in constant time. Use the appropriate helper function for this. Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Reviewed-by: Andrea Mayer Link: https://patch.msgid.link/20250818202724.15713-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index d77b52523b6a..fd58426f222b 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -280,7 +281,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; - if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) + if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; -- cgit v1.2.3 From a6d4f25888b83b8300aef28d9ee22765c1cc9b34 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Aug 2025 17:40:30 +0000 Subject: net: set net.core.rmem_max and net.core.wmem_max to 4 MB SO_RCVBUF and SO_SNDBUF have limited range today, unless distros or system admins change rmem_max and wmem_max. Even iproute2 uses 1 MB SO_RCVBUF which is capped by the kernel. Decouple [rw]mem_max and [rw]mem_default and increase [rw]mem_max to 4 MB. 
Before: $ sysctl net.core.rmem_default net.core.rmem_max net.core.wmem_default net.core.wmem_max net.core.rmem_default = 212992 net.core.rmem_max = 212992 net.core.wmem_default = 212992 net.core.wmem_max = 212992 After: $ sysctl net.core.rmem_default net.core.rmem_max net.core.wmem_default net.core.wmem_max net.core.rmem_default = 212992 net.core.rmem_max = 4194304 net.core.wmem_default = 212992 net.core.wmem_max = 4194304 Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250819174030.1986278-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/sysctl/net.rst | 4 ++++ Documentation/networking/ip-sysctl.rst | 6 +++--- include/net/sock.h | 4 ++-- net/core/sock.c | 8 ++++---- net/ipv4/arp.c | 2 +- net/ipv6/ndisc.c | 2 +- 6 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 7b0c4291c686..2ef50828aff1 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -222,6 +222,8 @@ rmem_max The maximum receive socket buffer size in bytes. +Default: 4194304 + rps_default_mask ---------------- @@ -247,6 +249,8 @@ wmem_max The maximum send socket buffer size in bytes. +Default: 4194304 + message_burst and message_cost ------------------------------ diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 43badb338d22..9f5891c9b07b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -209,7 +209,7 @@ neigh/default/unres_qlen_bytes - INTEGER Setting negative value is meaningless and will return error. - Default: SK_WMEM_MAX, (same as net.core.wmem_default). + Default: SK_WMEM_DEFAULT, (same as net.core.wmem_default). Exact value depends on architecture and kernel options, but should be enough to allow queuing 256 packets @@ -805,8 +805,8 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max This value results in initial window of 65535. max: maximal size of receive buffer allowed for automatically - selected receiver buffers for TCP socket. This value does not override - net.core.rmem_max. Calling setsockopt() with SO_RCVBUF disables + selected receiver buffers for TCP socket. + Calling setsockopt() with SO_RCVBUF disables automatic tuning of that socket's receive buffer size, in which case this value is ignored. Default: between 131072 and 32MB, depending on RAM size. diff --git a/include/net/sock.h b/include/net/sock.h index 1c49ea13af4a..63a6a48afb48 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2970,8 +2970,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/net/core/sock.c b/net/core/sock.c index ab6953d295df..8002ac6293dc 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. 
*/ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5cfc1c939673..833f2cf97178 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -170,7 +170,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7d5abb3158ec..57aaa7ae8ac3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -130,7 +130,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, -- cgit v1.2.3 From 62a2b3502573091dc5de3f9acd9e47f4b5aac9a1 Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Mon, 18 Aug 2025 13:28:05 -0400 Subject: net: openvswitch: Use for_each_cpu() where appropriate Due to legacy reasons, openvswitch code opencodes for_each_cpu() to make sure that CPU0 is always considered. Since commit c4b2bf6b4a35 ("openvswitch: Optimize operations for OvS flow_stats."), the corresponding flow->cpu_used_mask is initialized such that CPU0 is explicitly set. So, switch the code to using plain for_each_cpu(). Suggested-by: Ilya Maximets Signed-off-by: Yury Norov (NVIDIA) Acked-by: Ilya Maximets Link: https://patch.msgid.link/20250818172806.189325-1-yury.norov@gmail.com Signed-off-by: Jakub Kicinski --- net/openvswitch/flow.c | 12 ++++-------- net/openvswitch/flow_table.c | 7 +++---- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b80bd3a90773..66366982f604 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -129,15 +129,13 @@ void ovs_flow_stats_get(const struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int cpu; + unsigned int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + for_each_cpu(cpu, flow->cpu_used_mask) { struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { @@ -158,11 +156,9 @@ void ovs_flow_stats_get(const struct sw_flow *flow, /* Called with ovs_mutex.
*/ void ovs_flow_stats_clear(struct sw_flow *flow) { - int cpu; + unsigned int cpu; - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + for_each_cpu(cpu, flow->cpu_used_mask) { struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d108ae0bd0ee..ffc72a741a50 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -107,16 +107,15 @@ int ovs_flow_tbl_count(const struct flow_table *table) static void flow_free(struct sw_flow *flow) { - int cpu; + unsigned int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *) flow->sf_acts); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + + for_each_cpu(cpu, flow->cpu_used_mask) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); -- cgit v1.2.3 From 833e43171b00caef456a7128718e43453f77b2e7 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 15 Aug 2025 17:33:33 +0200 Subject: net: pktgen: Use min()/min_t() to improve pktgen_finalize_skb() Use min() and min_t() to improve pktgen_finalize_skb() and avoid calculating 'datalen / frags' twice. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250815153334.295431-3-thorsten.blum@linux.dev Signed-off-by: Paolo Abeni --- net/core/pktgen.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0ebe5461d4d9..d41b03fd1f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -114,6 +114,7 @@ #include #include +#include #include #include #include @@ -2841,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } i = 0; - frag_len = (datalen/frags) < PAGE_SIZE ? - (datalen/frags) : PAGE_SIZE; + frag_len = min_t(int, datalen / frags, PAGE_SIZE); while (datalen > 0) { if (unlikely(!pkt_dev->page)) { int node = numa_node_id(); @@ -2859,8 +2859,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, if (i == (frags - 1)) skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], pkt_dev->page, 0, - (datalen < PAGE_SIZE ? - datalen : PAGE_SIZE)); + min(datalen, PAGE_SIZE)); else skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], pkt_dev->page, 0, frag_len); -- cgit v1.2.3 From 62708b9452f8eb77513115b17c4f8d1a22ebf843 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Aug 2025 19:19:51 -0700 Subject: tls: fix handling of zero-length records on the rx_list Each recvmsg() call must process either - only contiguous DATA records (any number of them) - one non-DATA record If the next record has different type than what has already been processed we break out of the main processing loop. If the record has already been decrypted (which may be the case for TLS 1.3 where we don't know type until decryption) we queue the pending record to the rx_list. Next recvmsg() will pick it up from there. Queuing the skb to rx_list after zero-copy decrypt is not possible, since in that case we decrypted directly to the user space buffer, and we don't have an skb to queue (darg.skb points to the ciphertext skb for access to metadata like length). 
Only data records are allowed zero-copy, and we break the processing loop after each non-data record. So we should never zero-copy and then find out that the record type has changed. The corner case we missed is when the initial record comes from rx_list, and it's zero length. Reported-by: Muhammad Alifa Ramdhan Reported-by: Billy Jheng Bing-Jhong Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250820021952.143068-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 51c98a007dda..bac65d0d4e3e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1808,6 +1808,9 @@ int decrypt_skb(struct sock *sk, struct scatterlist *sgout) return tls_decrypt_sg(sk, NULL, sgout, &darg); } +/* All records returned from a recvmsg() call must have the same type. + * 0 is not a valid content type. Use it as "no type reported, yet". + */ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { @@ -2051,8 +2054,10 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto end; + /* process_rx_list() will set @control if it processed any records */ copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) + if (len <= copied || rx_more || + (control && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); -- cgit v1.2.3 From 91a79b792204313153e1bdbbe5acbfc28903b3a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Aug 2025 14:37:07 +0200 Subject: netfilter: nf_reject: don't leak dst refcount for loopback packets recent patches to add a WARN() when replacing skb dst entry found an old bug: WARNING: include/linux/skbuff.h:1165 skb_dst_check_unset include/linux/skbuff.h:1164 [inline] WARNING: include/linux/skbuff.h:1165 skb_dst_set include/linux/skbuff.h:1210 [inline] WARNING: include/linux/skbuff.h:1165 nf_reject_fill_skb_dst+0x2a4/0x330 net/ipv4/netfilter/nf_reject_ipv4.c:234 [..] Call Trace: nf_send_unreach+0x17b/0x6e0 net/ipv4/netfilter/nf_reject_ipv4.c:325 nft_reject_inet_eval+0x4bc/0x690 net/netfilter/nft_reject_inet.c:27 expr_call_ops_eval net/netfilter/nf_tables_core.c:237 [inline] .. This is because blamed commit forgot about loopback packets. Such packets already have a dst_entry attached, even at PRE_ROUTING stage. Instead of checking hook just check if the skb already has a route attached to it. 
Fixes: f53b9b0bdc59 ("netfilter: introduce support for reject at prerouting stage") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20250820123707.10671-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/nf_reject_ipv4.c | 6 ++---- net/ipv6/netfilter/nf_reject_ipv6.c | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 87fd945a0d27..0d3cb2ba6fc8 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -247,8 +247,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, if (!oth) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(oldskb) < 0) + if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -321,8 +320,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 838295fa32e3..cb2d38e80de9 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -293,7 +293,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { + if (!skb_dst(oldskb)) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; @@ -397,8 +397,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && - nf_reject6_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); -- cgit v1.2.3 From b08a784a5d1495c42ff9b0c70887d49211cddfe0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 Aug 2025 19:03:54 +0100 Subject: net: Introduce skb_copy_datagram_from_iter_full() In a similar manner to copy_from_iter()/copy_from_iter_full(), introduce skb_copy_datagram_from_iter_full() which reverts the iterator to its initial state when returning an error. A subsequent fix for a vsock regression will make use of this new function. Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Will Deacon Acked-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi Link: https://patch.msgid.link/20250818180355.29275-2-will@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ net/core/datagram.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 14b923ddb6df..fa633657e4c0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4172,6 +4172,8 @@ int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, u32 *crcp); int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, struct iov_iter *from, int len); +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len); int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..f474b9b120f9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -618,6 +618,20 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) +{ + struct iov_iter_state state; + int ret; + + iov_iter_save_state(from, &state); + ret = skb_copy_datagram_from_iter(skb, offset, from, len); + if (ret) + iov_iter_restore(from, &state); + return ret; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); + int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { -- cgit v1.2.3 From 7fb1291257ea1e27dbc3f34c6a37b4d640aafdd7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 18 Aug 2025 19:03:55 +0100 Subject: vsock/virtio: Fix message iterator handling on transmit path Commit 6693731487a8 ("vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers") converted the virtio vsock transmit path to utilise nonlinear SKBs when handling large buffers. As part of this change, virtio_transport_fill_skb() was updated to call skb_copy_datagram_from_iter() instead of memcpy_from_msg() as the latter expects a single destination buffer and cannot handle nonlinear SKBs correctly. Unfortunately, during this conversion, I overlooked the error case when the copying function returns -EFAULT due to a fault on the input buffer in userspace. In this case, memcpy_from_msg() reverts the iterator to its initial state thanks to copy_from_iter_full() whereas skb_copy_datagram_from_iter() leaves the iterator partially advanced. This results in a WARN_ONCE() from the vsock code, which expects the iterator to stay in sync with the number of bytes transmitted so that virtio_transport_send_pkt_info() can return -EFAULT when it is called again: ------------[ cut here ]------------ 'send_pkt()' returns 0, but 65536 expected WARNING: CPU: 0 PID: 5503 at net/vmw_vsock/virtio_transport_common.c:428 virtio_transport_send_pkt_info+0xd11/0xf00 net/vmw_vsock/virtio_transport_common.c:426 Modules linked in: CPU: 0 UID: 0 PID: 5503 Comm: syz.0.17 Not tainted 6.16.0-syzkaller-12063-g37816488247d #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call skb_copy_datagram_from_iter_full() from virtio_transport_fill_skb() to restore the previous iterator behaviour.
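A minimal usage sketch of the new helper (hypothetical caller, condensed error handling):
```
/* Copy len bytes into a possibly nonlinear skb. On failure the
 * iterator is rewound, so the caller's byte accounting stays in
 * sync and -EFAULT can simply be propagated.
 */
if (skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len))
	return -EFAULT;
```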
Cc: Jason Wang Cc: Stefano Garzarella Fixes: 6693731487a8 ("vsock/virtio: Allocate nonlinear SKBs for handling large transmit buffers") Reported-by: syzbot+b4d960daf7a3c7c2b7b1@syzkaller.appspotmail.com Signed-off-by: Will Deacon Acked-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Link: https://patch.msgid.link/20250818180355.29275-3-will@kernel.org Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index fe92e5fa95b4..dcc8a1d5851e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -105,12 +105,14 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, size_t len, bool zcopy) { + struct msghdr *msg = info->msg; + if (zcopy) - return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, len, NULL); + return __zerocopy_sg_from_iter(msg, NULL, skb, + &msg->msg_iter, len, NULL); virtio_vsock_skb_put(skb, len); - return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len); + return skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len); } static void virtio_transport_init_hdr(struct sk_buff *skb, -- cgit v1.2.3 From 5d7eba62e5eb68347de59b31b347b24f304cf21c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 22 Aug 2025 13:40:18 -0400 Subject: Bluetooth: hci_conn: Make unacked packet handling more robust This attempts to make unacked packet handling more robust by detecting when there are no connections left and, if so, restoring all buffers of the respective pool. Fixes: 5638d9ea9c01 ("Bluetooth: hci_conn: Fix not restoring ISO buffer count on disconnect") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 58 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7a879290dd28..e524bb59bff2 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -149,8 +149,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_chan_list_flush(conn); - hci_conn_hash_del(hdev, conn); - if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); @@ -1152,28 +1150,54 @@ void hci_conn_del(struct hci_conn *conn) disable_delayed_work_sync(&conn->auto_accept_work); disable_delayed_work_sync(&conn->idle_work); - if (conn->type == ACL_LINK) { - /* Unacked frames */ - hdev->acl_cnt += conn->sent; - } else if (conn->type == LE_LINK) { - cancel_delayed_work(&conn->le_conn_timeout); + /* Remove the connection from the list so unacked logic can detect when + * a certain pool is not being utilized. + */ + hci_conn_hash_del(hdev, conn); - if (hdev->le_pkts) - hdev->le_cnt += conn->sent; + /* Handle unacked frames: + * + * - In case there are no connections, or if restoring the buffers + * considered in transit would overflow, restore all buffers to the + * pool.
+ * - Otherwise restore just the buffers considered in transit for the + * hci_conn + */ + switch (conn->type) { + case ACL_LINK: + if (!hci_conn_num(hdev, ACL_LINK) || + hdev->acl_cnt + conn->sent > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; - } else { - /* Unacked ISO frames */ - if (conn->type == CIS_LINK || - conn->type == BIS_LINK || - conn->type == PA_LINK) { - if (hdev->iso_pkts) - hdev->iso_cnt += conn->sent; - else if (hdev->le_pkts) + break; + case LE_LINK: + cancel_delayed_work(&conn->le_conn_timeout); + + if (hdev->le_pkts) { + if (!hci_conn_num(hdev, LE_LINK) || + hdev->le_cnt + conn->sent > hdev->le_pkts) + hdev->le_cnt = hdev->le_pkts; + else hdev->le_cnt += conn->sent; + } else { + if ((!hci_conn_num(hdev, LE_LINK) && + !hci_conn_num(hdev, ACL_LINK)) || + hdev->acl_cnt + conn->sent > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; } + break; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + if (!hci_iso_count(hdev) || + hdev->iso_cnt + conn->sent > hdev->iso_pkts) + hdev->iso_cnt = hdev->iso_pkts; + else + hdev->iso_cnt += conn->sent; + break; } skb_queue_purge(&conn->data_q); -- cgit v1.2.3 From 2f050a5392b7a0928bf836d9891df4851463512c Mon Sep 17 00:00:00 2001 From: Ludovico de Nittis Date: Tue, 12 Aug 2025 17:55:26 +0200 Subject: Bluetooth: hci_event: Treat UNKNOWN_CONN_ID on disconnect as success When the host sends an HCI_OP_DISCONNECT command, the controller may respond with the status HCI_ERROR_UNKNOWN_CONN_ID (0x02). E.g. this can happen on resume from suspend, if the link was terminated by the remote device before the event mask was correctly set. This is a btmon snippet that shows the issue: ``` > ACL Data RX: Handle 3 flags 0x02 dlen 12 L2CAP: Disconnection Request (0x06) ident 5 len 4 Destination CID: 65 Source CID: 72 < ACL Data TX: Handle 3 flags 0x00 dlen 12 L2CAP: Disconnection Response (0x07) ident 5 len 4 Destination CID: 65 Source CID: 72 > ACL Data RX: Handle 3 flags 0x02 dlen 12 L2CAP: Disconnection Request (0x06) ident 6 len 4 Destination CID: 64 Source CID: 71 < ACL Data TX: Handle 3 flags 0x00 dlen 12 L2CAP: Disconnection Response (0x07) ident 6 len 4 Destination CID: 64 Source CID: 71 < HCI Command: Set Event Mask (0x03|0x0001) plen 8 Mask: 0x3dbff807fffbffff Inquiry Complete Inquiry Result Connection Complete Connection Request Disconnection Complete Authentication Complete [...] < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 3 Address: 78:20:A5:4A:DF:28 (Nintendo Co.,Ltd) Reason: Remote User Terminated Connection (0x13) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 1 Status: Unknown Connection Identifier (0x02) ``` Currently, the hci_cs_disconnect function treats any non-zero status as a command failure. This can be misleading because the connection is indeed being terminated and the controller is confirming that it has no knowledge of that connection handle. This means the initial request to disconnect the device should be treated as done. With this change, we allow the function to proceed, following the success path, which correctly calls `mgmt_device_disconnected` and ensures a consistent state.
Link: https://github.com/bluez/bluez/issues/1226 Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Ludovico de Nittis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fe7cdd67ad2a..6c67dfa139e2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2703,7 +2703,7 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) if (!conn) goto unlock; - if (status) { + if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) { mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); -- cgit v1.2.3 From b7fafbc499b5ee164018eb0eefe9027f5a6aaad2 Mon Sep 17 00:00:00 2001 From: Ludovico de Nittis Date: Tue, 12 Aug 2025 17:55:27 +0200 Subject: Bluetooth: hci_event: Mark connection as closed during suspend disconnect When suspending, the disconnect command for an active Bluetooth connection could be issued, but the corresponding `HCI_EV_DISCONN_COMPLETE` event might not be received before the system completes the suspend process. This can lead to an inconsistent state. On resume, the controller may auto-accept reconnections from the same device (due to suspend event filters), but these new connections are rejected by the kernel which still has connection objects from before suspend. Resulting in errors like: ``` kernel: Bluetooth: hci0: ACL packet for unknown connection handle 1 kernel: Bluetooth: hci0: Ignoring HCI_Connection_Complete for existing connection ``` This is a btmon snippet that shows the issue: ``` < HCI Command: Disconnect (0x01|0x0006) plen 3 Handle: 1 Address: 78:20:A5:4A:DF:28 (Nintendo Co.,Ltd) Reason: Remote User Terminated Connection (0x13) > HCI Event: Command Status (0x0f) plen 4 Disconnect (0x01|0x0006) ncmd 2 Status: Success (0x00) [...] // Host suspends with the event filter set for the device // On resume, the device tries to reconnect with a new handle > HCI Event: Connect Complete (0x03) plen 11 Status: Success (0x00) Handle: 2 Address: 78:20:A5:4A:DF:28 (Nintendo Co.,Ltd) // Kernel ignores this event because there is an existing connection with // handle 1 ``` By explicitly setting the connection state to BT_CLOSED we can ensure a consistent state, even if we don't receive the disconnect complete event in time. 
Link: https://github.com/bluez/bluez/issues/1226 Fixes: 182ee45da083 ("Bluetooth: hci_sync: Rework hci_suspend_notifier") Signed-off-by: Ludovico de Nittis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6c67dfa139e2..ce0ff06f2f73 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2718,6 +2718,12 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) goto done; } + /* During suspend, mark connection as closed immediately + * since we might not receive HCI_EV_DISCONN_COMPLETE + */ + if (hdev->suspended) + conn->state = BT_CLOSED; + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); if (conn->type == ACL_LINK) { -- cgit v1.2.3 From 15bf2c6391bafb14a3020d06ec0761bce0803463 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 20 Aug 2025 17:04:00 -0400 Subject: Bluetooth: hci_event: Detect if HCI_EV_NUM_COMP_PKTS is unbalanced This attempts to detect if HCI_EV_NUM_COMP_PKTS contain an unbalanced (more than currently considered outstanding) number of packets otherwise it could cause the hcon->sent to underflow and loop around breaking the tracking of the outstanding packets pending acknowledgment. Fixes: f42809185896 ("Bluetooth: Simplify num_comp_pkts_evt function") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ce0ff06f2f73..904bcff4f4ca 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4404,7 +4404,17 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, if (!conn) continue; - conn->sent -= count; + /* Check if there is really enough packets outstanding before + * attempting to decrease the sent counter otherwise it could + * underflow.. + */ + if (conn->sent >= count) { + conn->sent -= count; + } else { + bt_dev_warn(hdev, "hcon %p sent %u < count %u", + conn, conn->sent, count); + conn->sent = 0; + } for (i = 0; i < count; ++i) hci_conn_tx_dequeue(conn); -- cgit v1.2.3 From 55b9551fcdf6a2fe7f3422918d5697b56794da72 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 20 Aug 2025 10:16:17 +0800 Subject: Bluetooth: hci_event: Disconnect device when BIG sync is lost When a BIG sync is lost, the device should be set to "disconnected". This ensures symmetry with the ISO path setup, where the device is marked as "connected" once the path is established. Without this change, the device state remains inconsistent and may lead to a memory leak. 
Fixes: b2a5f2e1c127 ("Bluetooth: hci_event: Add support for handling LE BIG Sync Lost event") Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 5 +++++ net/bluetooth/mgmt.c | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 904bcff4f4ca..7a2174851857 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7024,6 +7024,7 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, { struct hci_evt_le_big_sync_lost *ev = data; struct hci_conn *bis, *conn; + bool mgmt_conn; bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); @@ -7042,6 +7043,10 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, BT_CONNECTED, HCI_ROLE_SLAVE))) { + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &bis->flags); + mgmt_device_disconnected(hdev, &bis->dst, bis->type, bis->dst_type, + ev->reason, mgmt_conn); + clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); hci_disconn_cfm(bis, ev->reason); hci_conn_del(bis); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3166f5fb876b..90e37ff2c85d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -9705,7 +9705,9 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (!mgmt_connected) return; - if (link_type != ACL_LINK && link_type != LE_LINK) + if (link_type != ACL_LINK && + link_type != LE_LINK && + link_type != BIS_LINK) return; bacpy(&ev.addr.bdaddr, bdaddr); -- cgit v1.2.3 From 6bbd0d3f0c23fc53c17409dd7476f38ae0ff0cd9 Mon Sep 17 00:00:00 2001 From: Pavel Shpakovskiy Date: Fri, 22 Aug 2025 12:20:55 +0300 Subject: Bluetooth: hci_sync: fix set_local_name race condition Function set_name_sync() uses the hdev->dev_name field to send the HCI_OP_WRITE_LOCAL_NAME command, but the copy from data into hdev->dev_name happens after the mgmt cmd was queued, so it is possible that set_name_sync() will read the old name value. This change adds name as a parameter to hci_update_name_sync() to avoid the race condition.
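Distilled, the change swaps a late read of shared state for a snapshot carried by the queued command (condensed before/after sketch):
```
/* Before: the worker dereferences hdev->dev_name, which the mgmt
 * handler only fills in after queueing the command. */
hci_update_name_sync(hdev);		/* reads possibly stale dev_name */

/* After: the worker uses the name stored in the command itself. */
hci_update_name_sync(hdev, cp->name);	/* cp = cmd->param, stable copy */
```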
Fixes: 6f6ff38a1e14 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LOCAL_NAME") Signed-off-by: Pavel Shpakovskiy Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 2 +- net/bluetooth/hci_sync.c | 6 +++--- net/bluetooth/mgmt.c | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 5224f57f6af2..e352a4e0ef8d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -93,7 +93,7 @@ int hci_update_class_sync(struct hci_dev *hdev); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); -int hci_update_name_sync(struct hci_dev *hdev); +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name); int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 31d72b9683ef..b6f888d8354e 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3481,13 +3481,13 @@ int hci_update_scan_sync(struct hci_dev *hdev) return hci_write_scan_enable_sync(hdev, scan); } -int hci_update_name_sync(struct hci_dev *hdev) +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); - memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); + memcpy(cp.name, name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, @@ -3540,7 +3540,7 @@ int hci_powered_update_sync(struct hci_dev *hdev) hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, hdev->dev_name); hci_update_eir_sync(hdev); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 90e37ff2c85d..50634ef5c8b7 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3892,8 +3892,11 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) static int set_name_sync(struct hci_dev *hdev, void *data) { + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_set_local_name *cp = cmd->param; + if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, cp->name); hci_update_eir_sync(hdev); } -- cgit v1.2.3 From abadf0ff63be488dc502ecfc9f622929a21b7117 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 21 Aug 2025 03:03:46 +0000 Subject: page_pool: fix incorrect mp_ops error handling Minor fix to the memory provider error handling: we should jump to free_ptr_ring in this error case rather than returning directly. Found by code inspection.
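The surrounding init function follows the usual unwind-ladder style, which the one-line change restores (condensed, illustrative sketch; names abbreviated):
```
err = ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL);
if (err)
	return err;

if (pool->mp_ops) {
	if (!pool->dma_map || !pool->dma_sync) {
		err = -EOPNOTSUPP;
		goto free_ptr_ring;	/* was: return -EOPNOTSUPP, leaking the ring */
	}
}
return 0;

free_ptr_ring:
	ptr_ring_cleanup(&pool->ring, NULL);
	return err;
```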
Cc: skhawaja@google.com Fixes: b400f4b87430 ("page_pool: Set `dma_sync` to false for devmem memory provider") Signed-off-by: Mina Almasry Reviewed-by: Samiullah Khawaja Link: https://patch.msgid.link/20250821030349.705244-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 343a6cac21e3..ba70569bd4b0 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -287,8 +287,10 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_ops) { - if (!pool->dma_map || !pool->dma_sync) - return -EOPNOTSUPP; + if (!pool->dma_map || !pool->dma_sync) { + err = -EOPNOTSUPP; + goto free_ptr_ring; + } if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { err = -EFAULT; -- cgit v1.2.3 From d5ffba0f254d29a13908d4510762b31d4247a94c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Aug 2025 14:19:00 +0000 Subject: tcp: annotate data-races around tp->rx_opt.user_mss This field is already read locklessly for listeners, next patch will make setsockopt(TCP_MAXSEG) lockless. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250821141901.18839-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 6 ++++-- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h | 2 +- net/ipv4/tcp.c | 8 +++++--- net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 6 ++++-- 5 files changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 2e7c2691a193..000116e47e38 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -951,6 +951,7 @@ static unsigned int chtls_select_mss(const struct chtls_sock *csk, struct tcp_sock *tp; unsigned int mss; struct sock *sk; + u16 user_mss; mss = ntohs(req->tcpopt.mss); sk = csk->sk; @@ -969,8 +970,9 @@ static unsigned int chtls_select_mss(const struct chtls_sock *csk, tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4); tp->advmss = dst_metric_advmss(dst); - if (USER_MSS(tp) && tp->advmss > USER_MSS(tp)) - tp->advmss = USER_MSS(tp); + user_mss = USER_MSS(tp); + if (user_mss && tp->advmss > user_mss) + tp->advmss = user_mss; if (tp->advmss > pmtu - iphdrsz) tp->advmss = pmtu - iphdrsz; if (mss && tp->advmss > mss) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h index 2285cf2df251..667effc2a23c 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h @@ -90,7 +90,7 @@ struct deferred_skb_cb { #define SND_WSCALE(tp) ((tp)->rx_opt.snd_wscale) #define RCV_WSCALE(tp) ((tp)->rx_opt.rcv_wscale) -#define USER_MSS(tp) ((tp)->rx_opt.user_mss) +#define USER_MSS(tp) (READ_ONCE((tp)->rx_opt.user_mss)) #define TS_RECENT_STAMP(tp) ((tp)->rx_opt.ts_recent_stamp) #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok) #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc55..a12d81e01b3f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3760,7 +3760,7 @@ int tcp_sock_set_maxseg(struct sock *sk, int val) if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) return -EINVAL; - 
tcp_sk(sk)->rx_opt.user_mss = val; + WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val); return 0; } @@ -4383,6 +4383,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + int user_mss; int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) @@ -4396,9 +4397,10 @@ int do_tcp_getsockopt(struct sock *sk, int level, switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (tp->rx_opt.user_mss && + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) - val = tp->rx_opt.user_mss; + val = user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71b76e98371a..7b537978dfe6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6297,7 +6297,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; - if (mss == tp->rx_opt.user_mss) { + if (mss == READ_ONCE(tp->rx_opt.user_mss)) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ @@ -7117,7 +7117,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return 0; } - mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss)); if (!mss) mss = af_ops->mss_clamp; @@ -7131,7 +7131,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, { struct tcp_fastopen_cookie foc = { .len = -1 }; struct tcp_options_received tmp_opt; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct request_sock *req; @@ -7182,7 +7182,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; - tmp_opt.user_mss = tp->rx_opt.user_mss; + tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss); tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, want_cookie ? NULL : &foc); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dfbac0876d96..86892c8672ed 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3890,6 +3890,7 @@ static void tcp_connect_init(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + u16 user_mss; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. @@ -3902,8 +3903,9 @@ static void tcp_connect_init(struct sock *sk) tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ - if (tp->rx_opt.user_mss) - tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss) + tp->rx_opt.mss_clamp = user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); -- cgit v1.2.3 From 9217146fee49575dfe4ac9416587392fc31171f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Aug 2025 14:19:01 +0000 Subject: tcp: lockless TCP_MAXSEG option setsockopt(TCP_MAXSEG) writes over a field that does not need socket lock protection anymore. 
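For context, a minimal userspace sketch of the option being made lockless (values arbitrary):
```
int mss = 1400;

/* Clamp the MSS before connect(); after this series the kernel
 * stores the value without taking the socket lock. */
if (setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) < 0)
	perror("setsockopt(TCP_MAXSEG)");
```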
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250821141901.18839-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a12d81e01b3f..99232903b03c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3890,15 +3890,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); return 0; } + case TCP_MAXSEG: + return tcp_sock_set_maxseg(sk, val); } sockopt_lock_sock(sk); switch (optname) { - case TCP_MAXSEG: - err = tcp_sock_set_maxseg(sk, val); - break; - case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; -- cgit v1.2.3 From 9308987803bbf289d088d5266c5c3598e3fb3ddf Mon Sep 17 00:00:00 2001 From: Ujwal Kundur Date: Wed, 20 Aug 2025 23:25:47 +0530 Subject: rds: Replace POLLERR with EPOLLERR Both constants are 1<<3, but EPOLLERR uses the correct annotations. Flagged by Sparse. Signed-off-by: Ujwal Kundur Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250820175550.498-2-ujwal.kundur@gmail.com Signed-off-by: Jakub Kicinski --- net/rds/af_rds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 086a13170e09..4a7217fbeab6 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -242,7 +242,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= EPOLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ -- cgit v1.2.3 From 92b925297a2f233e0b16694df7b524360b8abb93 Mon Sep 17 00:00:00 2001 From: Ujwal Kundur Date: Wed, 20 Aug 2025 23:25:48 +0530 Subject: rds: Fix endianness annotation of jhash wrappers __ipv6_addr_jhash (wrapper around jhash2()) and __inet_ehashfn (wrapper around jhash_3words()) work with u32 (host endian) values but accept big endian inputs. Declare the local variables as big endian to avoid unnecessary casts. Flagged by Sparse. 
Signed-off-by: Ujwal Kundur Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250820175550.498-3-ujwal.kundur@gmail.com Signed-off-by: Jakub Kicinski --- net/rds/connection.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index d62f486ab29f..68bc88cce84e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -57,16 +57,17 @@ static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; - u32 lhash, fhash, hash; + __be32 lhash, fhash; + u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; + lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) - fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); + fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else - fhash = (__force u32)faddr->s6_addr32[3]; + fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); -- cgit v1.2.3 From 77907a068717fbefb25faf01fecca553aca6ccaa Mon Sep 17 00:00:00 2001 From: Ujwal Kundur Date: Wed, 20 Aug 2025 23:25:49 +0530 Subject: rds: Fix endianness annotation for RDS_MPATH_HASH jhash_1word accepts host endian inputs while rs_bound_port is a be16 value (sockaddr_in6.sin6_port). Use ntohs() for consistency. Flagged by Sparse. Signed-off-by: Ujwal Kundur Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250820175550.498-4-ujwal.kundur@gmail.com Signed-off-by: Jakub Kicinski --- net/rds/rds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index dc360252c515..5b1c072e2e7f 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -93,7 +93,7 @@ enum { /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 -#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ +#define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) -- cgit v1.2.3 From bcb28bee987a1e161eaa5cc4cf2fb0e21306d4a7 Mon Sep 17 00:00:00 2001 From: Ujwal Kundur Date: Wed, 20 Aug 2025 23:25:50 +0530 Subject: rds: Fix endianness annotations for RDS extension headers Per the RDS 3.1 spec [1], RDS extension headers EXTHDR_NPATHS and EXTHDR_GEN_NUM are be16 and be32 values respectively, exchanged during normal operations over-the-wire (RDS Ping/Pong). This contrasts their declarations as host endian unsigned ints. Fix the annotations across occurrences. Flagged by Sparse. 
[1] https://oss.oracle.com/projects/rds/dist/documentation/rds-3.1-spec.html Signed-off-by: Ujwal Kundur Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250820175550.498-5-ujwal.kundur@gmail.com Signed-off-by: Jakub Kicinski --- net/rds/message.c | 4 ++-- net/rds/recv.c | 4 ++-- net/rds/send.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 7af59d2443e5..199a899a43e9 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -44,8 +44,8 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), -[RDS_EXTHDR_NPATHS] = sizeof(u16), -[RDS_EXTHDR_GEN_NUM] = sizeof(u32), +[RDS_EXTHDR_NPATHS] = sizeof(__be16), +[RDS_EXTHDR_GEN_NUM] = sizeof(__be32), }; void rds_message_addref(struct rds_message *rm) diff --git a/net/rds/recv.c b/net/rds/recv.c index 5627f80013f8..66205d6924bf 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -202,8 +202,8 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; - u16 rds_npaths; - u32 rds_gen_num; + __be16 rds_npaths; + __be32 rds_gen_num; } buffer; u32 new_peer_gen_num = 0; diff --git a/net/rds/send.c b/net/rds/send.c index 42d991bc8543..0b3d0ef2f008 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1454,8 +1454,8 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && cp->cp_conn->c_trans->t_mp_capable) { - u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); - u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); + __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); + __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, -- cgit v1.2.3 From ec79003c5f9d2c7f9576fc69b8dbda80305cbe3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 21 Aug 2025 02:18:24 +0000 Subject: atm: atmtcp: Prevent arbitrary write in atmtcp_recv_control(). syzbot reported the splat below. [0] When atmtcp_v_open() or atmtcp_v_close() is called via connect() or close(), atmtcp_send_control() is called to send an in-kernel special message. The message has ATMTCP_HDR_MAGIC in atmtcp_control.hdr.length. Also, a pointer of struct atm_vcc is set to atmtcp_control.vcc. The notable thing is struct atmtcp_control is uAPI but has a space for an in-kernel pointer. struct atmtcp_control { struct atmtcp_hdr hdr; /* must be first */ ... atm_kptr_t vcc; /* both directions */ ... } __ATM_API_ALIGN; typedef struct { unsigned char _[8]; } __ATM_API_ALIGN atm_kptr_t; The special message is processed in atmtcp_recv_control() called from atmtcp_c_send(). atmtcp_c_send() is vcc->dev->ops->send() and called from 2 paths: 1. .ndo_start_xmit() (vcc->send() == atm_send_aal0()) 2. vcc_sendmsg() The problem is sendmsg() does not validate the message length and userspace can abuse atmtcp_recv_control() to overwrite any kptr by atmtcp_control. Let's add a new ->pre_send() hook to validate messages from sendmsg(). 
[0]: Oops: general protection fault, probably for non-canonical address 0xdffffc00200000ab: 0000 [#1] SMP KASAN PTI KASAN: probably user-memory-access in range [0x0000000100000558-0x000000010000055f] CPU: 0 UID: 0 PID: 5865 Comm: syz-executor331 Not tainted 6.17.0-rc1-syzkaller-00215-gbab3ce404553 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:atmtcp_recv_control drivers/atm/atmtcp.c:93 [inline] RIP: 0010:atmtcp_c_send+0x1da/0x950 drivers/atm/atmtcp.c:297 Code: 4d 8d 75 1a 4c 89 f0 48 c1 e8 03 42 0f b6 04 20 84 c0 0f 85 15 06 00 00 41 0f b7 1e 4d 8d b7 60 05 00 00 4c 89 f0 48 c1 e8 03 <42> 0f b6 04 20 84 c0 0f 85 13 06 00 00 66 41 89 1e 4d 8d 75 1c 4c RSP: 0018:ffffc90003f5f810 EFLAGS: 00010203 RAX: 00000000200000ab RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88802a510000 RSI: 00000000ffffffff RDI: ffff888030a6068c RBP: ffff88802699fb40 R08: ffff888030a606eb R09: 1ffff1100614c0dd R10: dffffc0000000000 R11: ffffffff8718fc40 R12: dffffc0000000000 R13: ffff888030a60680 R14: 000000010000055f R15: 00000000ffffffff FS: 00007f8d7e9236c0(0000) GS:ffff888125c1c000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000045ad50 CR3: 0000000075bde000 CR4: 00000000003526f0 Call Trace: vcc_sendmsg+0xa10/0xc60 net/atm/common.c:645 sock_sendmsg_nosec net/socket.c:714 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:729 ____sys_sendmsg+0x505/0x830 net/socket.c:2614 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2668 __sys_sendmsg net/socket.c:2700 [inline] __do_sys_sendmsg net/socket.c:2705 [inline] __se_sys_sendmsg net/socket.c:2703 [inline] __x64_sys_sendmsg+0x19b/0x260 net/socket.c:2703 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f8d7e96a4a9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f8d7e923198 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f8d7e9f4308 RCX: 00007f8d7e96a4a9 RDX: 0000000000000000 RSI: 0000200000000240 RDI: 0000000000000005 RBP: 00007f8d7e9f4300 R08: 65732f636f72702f R09: 65732f636f72702f R10: 65732f636f72702f R11: 0000000000000246 R12: 00007f8d7e9c10ac R13: 00007f8d7e9231a0 R14: 0000200000000200 R15: 0000200000000250 Modules linked in: Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+1741b56d54536f4ec349@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68a6767c.050a0220.3d78fd.0011.GAE@google.com/ Tested-by: syzbot+1741b56d54536f4ec349@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250821021901.2814721-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- drivers/atm/atmtcp.c | 17 ++++++++++++++--- include/linux/atmdev.h | 1 + net/atm/common.c | 15 ++++++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c index eeae160c898d..fa3c76a2b49d 100644 --- a/drivers/atm/atmtcp.c +++ b/drivers/atm/atmtcp.c @@ -279,6 +279,19 @@ static struct atm_vcc *find_vcc(struct atm_dev *dev, short vpi, int vci) return NULL; } +static int atmtcp_c_pre_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct atmtcp_hdr *hdr; + + if (skb->len < sizeof(struct atmtcp_hdr)) + return -EINVAL; + + hdr = (struct atmtcp_hdr *)skb->data; + 
if (hdr->length == ATMTCP_HDR_MAGIC) + return -EINVAL; + + return 0; +} static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) { @@ -288,9 +301,6 @@ static int atmtcp_c_send(struct atm_vcc *vcc,struct sk_buff *skb) struct sk_buff *new_skb; int result = 0; - if (skb->len < sizeof(struct atmtcp_hdr)) - goto done; - dev = vcc->dev_data; hdr = (struct atmtcp_hdr *) skb->data; if (hdr->length == ATMTCP_HDR_MAGIC) { @@ -347,6 +357,7 @@ static const struct atmdev_ops atmtcp_v_dev_ops = { static const struct atmdev_ops atmtcp_c_dev_ops = { .close = atmtcp_c_close, + .pre_send = atmtcp_c_pre_send, .send = atmtcp_c_send }; diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 45f2f278b50a..70807c679f1a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -185,6 +185,7 @@ struct atmdev_ops { /* only send is required */ int (*compat_ioctl)(struct atm_dev *dev,unsigned int cmd, void __user *arg); #endif + int (*pre_send)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send)(struct atm_vcc *vcc,struct sk_buff *skb); int (*send_bh)(struct atm_vcc *vcc, struct sk_buff *skb); int (*send_oam)(struct atm_vcc *vcc,void *cell,int flags); diff --git a/net/atm/common.c b/net/atm/common.c index d7f7976ea13a..881c7f259dbd 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -635,18 +635,27 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) skb->dev = NULL; /* for paths shared with net_device interfaces */ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { - atm_return_tx(vcc, skb); - kfree_skb(skb); error = -EFAULT; - goto out; + goto free_skb; } if (eff != size) memset(skb->data + size, 0, eff-size); + + if (vcc->dev->ops->pre_send) { + error = vcc->dev->ops->pre_send(vcc, skb); + if (error) + goto free_skb; + } + error = vcc->dev->ops->send(vcc, skb); error = error ? error : size; out: release_sock(sk); return error; +free_skb: + atm_return_tx(vcc, skb); + kfree_skb(skb); + goto out; } __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) -- cgit v1.2.3 From c04db81cd0288dfc68b7a0f7d09bd49b40bba451 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Sun, 22 Jun 2025 22:39:56 +0900 Subject: net/9p: Fix buffer overflow in USB transport layer A buffer overflow vulnerability exists in the USB 9pfs transport layer where inconsistent size validation between packet header parsing and actual data copying allows a malicious USB host to overflow heap buffers. The issue occurs because: - usb9pfs_rx_header() validates only the declared size in packet header - usb9pfs_rx_complete() uses req->actual (actual received bytes) for memcpy This allows an attacker to craft packets with small declared size (bypassing validation) but large actual payload (triggering overflow in memcpy). Add validation in usb9pfs_rx_complete() to ensure req->actual does not exceed the buffer capacity before copying data. 
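The underlying rule, bound the copy by what was actually received and what the destination can hold rather than by the size the peer declared, reduces to a short sketch (condensed from the fix):
```
/* req->actual is what the USB host really sent; rc.capacity is what
 * the 9p receive buffer can hold. Refuse the copy if the first
 * exceeds the second. */
if (req->actual > p9_rx_req->rc.capacity) {
	p9_client_cb(usb9pfs->client, p9_rx_req, REQ_STATUS_ERROR);
	return;
}
memcpy(p9_rx_req->rc.sdata, req->buf, req->actual);
```
The fix itself keeps the callback path shared and zeroes the copy length instead; the sketch only captures the invariant.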
Reported-by: Yuhao Jiang Closes: https://lkml.kernel.org/r/20250616132539.63434-1-danisjiang@gmail.com Fixes: a3be076dc174 ("net/9p/usbg: Add new usb gadget function transport") Cc: stable@vger.kernel.org Message-ID: <20250622-9p-usb_overflow-v3-1-ab172691b946@codewreck.org> Signed-off-by: Dominique Martinet --- net/9p/trans_usbg.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c index 6b694f117aef..468f7e8f0277 100644 --- a/net/9p/trans_usbg.c +++ b/net/9p/trans_usbg.c @@ -231,6 +231,8 @@ static void usb9pfs_rx_complete(struct usb_ep *ep, struct usb_request *req) struct f_usb9pfs *usb9pfs = ep->driver_data; struct usb_composite_dev *cdev = usb9pfs->function.config->cdev; struct p9_req_t *p9_rx_req; + unsigned int req_size = req->actual; + int status = REQ_STATUS_RCVD; if (req->status) { dev_err(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n", @@ -242,11 +244,19 @@ static void usb9pfs_rx_complete(struct usb_ep *ep, struct usb_request *req) if (!p9_rx_req) return; - memcpy(p9_rx_req->rc.sdata, req->buf, req->actual); + if (req_size > p9_rx_req->rc.capacity) { + dev_err(&cdev->gadget->dev, + "%s received data size %u exceeds buffer capacity %zu\n", + ep->name, req_size, p9_rx_req->rc.capacity); + req_size = 0; + status = REQ_STATUS_ERROR; + } + + memcpy(p9_rx_req->rc.sdata, req->buf, req_size); - p9_rx_req->rc.size = req->actual; + p9_rx_req->rc.size = req_size; - p9_client_cb(usb9pfs->client, p9_rx_req, REQ_STATUS_RCVD); + p9_client_cb(usb9pfs->client, p9_rx_req, status); p9_req_put(usb9pfs->client, p9_rx_req); complete(&usb9pfs->received); -- cgit v1.2.3 From 674b56aa57f9379854cb6798c3bbcef7e7b51ab7 Mon Sep 17 00:00:00 2001 From: Nalivayko Sergey Date: Tue, 15 Jul 2025 18:48:15 +0300 Subject: net/9p: fix double req put in p9_fd_cancelled Syzkaller reports a KASAN issue as below: general protection fault, probably for non-canonical address 0xfbd59c0000000021: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: maybe wild-memory-access in range [0xdead000000000108-0xdead00000000010f] CPU: 0 PID: 5083 Comm: syz-executor.2 Not tainted 6.1.134-syzkaller-00037-g855bd1d7d838 #0 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 RIP: 0010:__list_del include/linux/list.h:114 [inline] RIP: 0010:__list_del_entry include/linux/list.h:137 [inline] RIP: 0010:list_del include/linux/list.h:148 [inline] RIP: 0010:p9_fd_cancelled+0xe9/0x200 net/9p/trans_fd.c:734 Call Trace: p9_client_flush+0x351/0x440 net/9p/client.c:614 p9_client_rpc+0xb6b/0xc70 net/9p/client.c:734 p9_client_version net/9p/client.c:920 [inline] p9_client_create+0xb51/0x1240 net/9p/client.c:1027 v9fs_session_init+0x1f0/0x18f0 fs/9p/v9fs.c:408 v9fs_mount+0xba/0xcb0 fs/9p/vfs_super.c:126 legacy_get_tree+0x108/0x220 fs/fs_context.c:632 vfs_get_tree+0x8e/0x300 fs/super.c:1573 do_new_mount fs/namespace.c:3056 [inline] path_mount+0x6a6/0x1e90 fs/namespace.c:3386 do_mount fs/namespace.c:3399 [inline] __do_sys_mount fs/namespace.c:3607 [inline] __se_sys_mount fs/namespace.c:3584 [inline] __x64_sys_mount+0x283/0x300 fs/namespace.c:3584 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 This happens because of a race condition between: - The 9p client sending an invalid flush request and later cleaning it up; - The 9p client in p9_read_work() canceled all pending requests. Thread 1 Thread 2 ... p9_client_create() ... p9_fd_create() ... 
p9_conn_create() ... // start Thread 2 INIT_WORK(&m->rq, p9_read_work); p9_read_work() ... p9_client_rpc() ... ... p9_conn_cancel() ... spin_lock(&m->req_lock); ... p9_fd_cancelled() ... ... spin_unlock(&m->req_lock); // status rewrite p9_client_cb(m->client, req, REQ_STATUS_ERROR) // first remove list_del(&req->req_list); ... spin_lock(&m->req_lock) ... // second remove list_del(&req->req_list); spin_unlock(&m->req_lock) ... Commit 74d6a5d56629 ("9p/trans_fd: Fix concurrency del of req_list in p9_fd_cancelled/p9_read_work") fixes a concurrency issue in the 9p filesystem client where the req_list could be deleted simultaneously by both p9_read_work and p9_fd_cancelled functions, but for the case where req->status equals REQ_STATUS_RCVD. Update the check for req->status in p9_fd_cancelled to skip processing not just received requests, but anything that is not SENT, as whatever changed the state from SENT also removed the request from its list. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: afd8d6541155 ("9P: Add cancelled() to the transport functions.") Cc: stable@vger.kernel.org Signed-off-by: Nalivayko Sergey Message-ID: <20250715154815.3501030-1-Sergey.Nalivayko@kaspersky.com> [updated the check from status == RECV || status == ERROR to status != SENT] Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 339ec4e54778..8992d8bebbdd 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -726,10 +726,10 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); spin_lock(&m->req_lock); - /* Ignore cancelled request if message has been received - * before lock. - */ - if (req->status == REQ_STATUS_RCVD) { + /* Ignore cancelled request if status changed since the request was + * processed in p9_client_flush() + */ + if (req->status != REQ_STATUS_SENT) { spin_unlock(&m->req_lock); return 0; } -- cgit v1.2.3 From 1b8c5fa0cb35efd08f07f700e6d78a541ebabe26 Mon Sep 17 00:00:00 2001 From: Oscar Maes Date: Tue, 19 Aug 2025 19:46:41 +0200 Subject: net: ipv4: allow directed broadcast routes to use dst hint Currently, ip_extract_route_hint uses RTN_BROADCAST to decide whether to use the route dst hint mechanism. This check is too strict, as it prevents directed broadcast routes from using the hint, resulting in poor performance during bursts of directed broadcast traffic. Fix this in ip_extract_route_hint and modify ip_route_use_hint to preserve the intended behaviour. 
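Conceptually, the hint lets the list-receive path skip a full FIB lookup for back-to-back packets to the same destination (simplified sketch; the real code also verifies that the hint's daddr matches before reusing it):
```
list_for_each_entry(skb, head, list) {
	if (hint)	/* previous packet qualified as a hint */
		err = ip_route_use_hint(skb, daddr, saddr, dscp, dev, hint);
	else		/* full route lookup */
		err = ip_route_input_noref(skb, daddr, saddr, dscp, dev);

	hint = ip_extract_route_hint(net, skb);	/* candidate for the next one */
}
```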
Signed-off-by: Oscar Maes Reviewed-by: David Ahern Link: https://patch.msgid.link/20250819174642.5148-2-oscmaes92@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_input.c | 11 +++++++---- net/ipv4/route.c | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fc323994b1fa..a09aca2c8567 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -587,9 +587,13 @@ static void ip_sublist_rcv_finish(struct list_head *head) } static struct sk_buff *ip_extract_route_hint(const struct net *net, - struct sk_buff *skb, int rt_type) + struct sk_buff *skb) { - if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || + const struct iphdr *iph = ip_hdr(skb); + + if (fib4_has_custom_rules(net) || + ipv4_is_lbcast(iph->daddr) || + ipv4_is_zeronet(iph->daddr) || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; @@ -618,8 +622,7 @@ static void ip_list_rcv_finish(struct net *net, struct list_head *head) dst = skb_dst(skb); if (curr_dst != dst) { - hint = ip_extract_route_hint(net, skb, - dst_rtable(dst)->rt_type); + hint = ip_extract_route_hint(net, skb); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f639a2ae881a..1f212b2ce4c6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2210,7 +2210,7 @@ ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_source; } - if (rt->rt_type != RTN_LOCAL) + if (!(rt->rt_flags & RTCF_LOCAL)) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, -- cgit v1.2.3 From e6f178be3c12cd6b8fb1b81dd0b9118e0a0d0333 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Aug 2025 09:17:25 +0000 Subject: tcp: annotate data-races around icsk->icsk_retransmits icsk->icsk_retransmits is read locklessly from inet_sk_diag_fill(), tcp_get_timestamping_opt_stats, get_tcp4_sock() and get_tcp6_sock(). Add corresponding READ_ONCE()/WRITE_ONCE() annotations. 
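The pattern, in sketch form: both sides of the race are annotated so the lockless access is explicit and tear-free (illustrative):
```
/* Writer side (under the socket lock) */
WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);

/* Lockless reader (inet_diag, /proc/net/tcp, ...) */
r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
```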
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250822091727.835869-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_diag.c | 2 +- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/ipv4/tcp_timer.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/mptcp/protocol.c | 3 ++- 8 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 2fa53b16fe77..35c1579e5bd4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -313,7 +313,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; - r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); r->idiag_expires = jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 99232903b03c..4728801d06a7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4346,7 +4346,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); - nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, + READ_ONCE(inet_csk(sk)->icsk_retransmits)); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7b537978dfe6..3baf3bef0d83 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2569,7 +2569,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) if (frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); if (tcp_is_non_sack_preventing_reopen(sk)) return true; if (frto_undo || tcp_is_sack(tp)) { @@ -3851,7 +3851,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; - icsk->icsk_retransmits = 0; + WRITE_ONCE(icsk->icsk_retransmits, 0); #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) @@ -6636,7 +6636,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) tcp_try_undo_recovery(sk); tcp_update_rto_time(tp); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set * retrans_stamp but don't enter CA_Loss, so in case that happened we * need to zero retrans_stamp here to prevent spurious diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84d3d556ed80..5d549dfd4e60 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2958,7 +2958,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, + READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(f), sk_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 86892c8672ed..72969c27eaaa 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3956,7 
+3956,7 @@ static void tcp_connect_init(struct sock *sk) WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); tcp_clear_retrans(tp); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a207877270fb..8b11ab4cc952 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -444,7 +444,7 @@ static void tcp_update_rto_stats(struct sock *sk) tp->total_rto_recoveries++; tp->rto_stamp = tcp_time_stamp_ms(tp); } - icsk->icsk_retransmits++; + WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1); tp->total_rto++; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7577e7eb2c97..7b177054452b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2230,7 +2230,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, + READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9a287b75c1b3..f2e728239480 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2587,7 +2587,8 @@ static void __mptcp_retrans(struct sock *sk) if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_retransmits++; + WRITE_ONCE(icsk->icsk_retransmits, + icsk->icsk_retransmits + 1); mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); -- cgit v1.2.3 From 9bd999eb35cfcc404fb640712f9023f51a303cbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Aug 2025 09:17:26 +0000 Subject: tcp: annotate data-races around icsk->icsk_probes_out icsk->icsk_probes_out is read locklessly from inet_sk_diag_fill(), get_tcp4_sock() and get_tcp6_sock(). Add corresponding READ_ONCE()/WRITE_ONCE() annotations. 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250822091727.835869-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_diag.c | 4 ++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 4 ++-- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 35c1579e5bd4..549f1f521f4f 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -318,12 +318,12 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; - r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; - r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4728801d06a7..9bc8317e92b7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3376,7 +3376,7 @@ int tcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3baf3bef0d83..a52a747d8a55 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3913,7 +3913,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * log. Something worked... */ WRITE_ONCE(sk->sk_err_soft, 0); - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5d549dfd4e60..9543f1538359 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2960,7 +2960,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_delta_to_clock_t(timer_expires - jiffies), READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(f), sk_uid(sk)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 72969c27eaaa..06b26a6efd62 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4394,13 +4394,13 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. 
*/ - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } - icsk->icsk_probes_out++; + WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8b11ab4cc952..2dd73a4e8e51 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -392,7 +392,7 @@ static void tcp_probe_timer(struct sock *sk) int max_probes; if (tp->packets_out || !skb) { - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; return; } @@ -839,7 +839,7 @@ static void tcp_keepalive_timer(struct timer_list *t) goto out; } if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { - icsk->icsk_probes_out++; + WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7b177054452b..5620d9e50e19 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2232,7 +2232,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_delta_to_clock_t(timer_expires - jiffies), READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), -- cgit v1.2.3 From 60c481d4caa569001c708d4e9622d19650b6bedc Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 22 Aug 2025 14:40:51 +0800 Subject: ipv6: mcast: Add ip6_mc_find_idev() helper Extract the duplicated logic from __ipv6_sock_mc_join() and ip6_mc_find_dev(), and add a new helper, ip6_mc_find_idev(), to reduce redundancy and enhance readability. No functional changes intended. Signed-off-by: Yue Haibing Reviewed-by: Dawid Osuchowski Link: https://patch.msgid.link/20250822064051.2991480-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv6/mcast.c | 67 ++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 36ca27496b3c..55c49dc14b1b 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -169,6 +169,29 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ?
iv : 1; } +static struct net_device *ip6_mc_find_dev(struct net *net, + const struct in6_addr *group, + int ifindex) +{ + struct net_device *dev = NULL; + struct rt6_info *rt; + + if (ifindex == 0) { + rcu_read_lock(); + rt = rt6_lookup(net, group, NULL, 0, NULL, 0); + if (rt) { + dev = dst_dev(&rt->dst); + dev_hold(dev); + ip6_rt_put(rt); + } + rcu_read_unlock(); + } else { + dev = dev_get_by_index(net, ifindex); + } + + return dev; +} + /* * socket join on multicast group */ @@ -191,28 +214,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); - if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; - if (ifindex == 0) { - struct rt6_info *rt; - - rcu_read_lock(); - rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); - if (rt) { - dev = dst_dev(&rt->dst); - dev_hold(dev); - ip6_rt_put(rt); - } - rcu_read_unlock(); - } else { - dev = dev_get_by_index(net, ifindex); - } - + dev = ip6_mc_find_dev(net, addr, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; @@ -302,27 +310,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -static struct inet6_dev *ip6_mc_find_dev(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_idev(struct net *net, + const struct in6_addr *group, + int ifindex) { - struct net_device *dev = NULL; + struct net_device *dev; struct inet6_dev *idev; - if (ifindex == 0) { - struct rt6_info *rt; - - rcu_read_lock(); - rt = rt6_lookup(net, group, NULL, 0, NULL, 0); - if (rt) { - dev = dst_dev(&rt->dst); - dev_hold(dev); - ip6_rt_put(rt); - } - rcu_read_unlock(); - } else { - dev = dev_get_by_index(net, ifindex); - } + dev = ip6_mc_find_dev(net, group, ifindex); if (!dev) return NULL; @@ -374,7 +369,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface); + idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; @@ -509,7 +504,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev(net, group, gsf->gf_interface); + idev = ip6_mc_find_idev(net, group, gsf->gf_interface); if (!idev) return -ENODEV; -- cgit v1.2.3 From 9db0163e3cad57a36ac335308c17550c6911b7df Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:56 +0000 Subject: tcp: Remove sk_protocol test for tcp_twsk_unique(). Commit 383eed2de529 ("tcp: get rid of twsk_unique()") added sk->sk_protocol test in __inet_check_established() and __inet6_check_established() to remove twsk_unique() and call tcp_twsk_unique() directly. DCCP has gone, and the condition is always true. Let's remove the sk_protocol test. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 3 +-- net/ipv6/inet6_hashtables.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ceeeec9b7290..fef71dd72521 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -579,8 +579,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (sk->sk_protocol == IPPROTO_TCP && - tcp_twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 76ee521189eb..dbb10774764a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -305,8 +305,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (sk->sk_protocol == IPPROTO_TCP && - tcp_twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; -- cgit v1.2.3 From 2d842b6c670b9bffee7c16cda284eb49644d8169 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:57 +0000 Subject: tcp: Remove timewait_sock_ops.twsk_destructor(). Since DCCP has been removed, sk->sk_prot->twsk_prot->twsk_destructor is always tcp_twsk_destructor(). Let's call tcp_twsk_destructor() directly in inet_twsk_free() and remove ->twsk_destructor(). While at it, tcp_twsk_destructor() is un-exported. 
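This is the usual "devirtualize a single-implementation hook" cleanup: once exactly one function can ever sit behind an ops pointer, the NULL check and indirect call collapse into a direct call, and the callee no longer needs to be exported. A toy self-contained sketch of the before/after shape, with invented names rather than the real timewait_sock_ops and tcp_twsk_destructor():

#include <stdio.h>

/* Before: an ops struct carrying an optional destructor hook. */
struct ops {
	void (*destroy)(int id);
};

static void only_destroy(int id)
{
	printf("destroyed %d\n", id);
}

static const struct ops the_only_ops = { .destroy = only_destroy };

/* Before: NULL-checked indirect call through the ops pointer. */
static void release_before(const struct ops *ops, int id)
{
	if (ops->destroy)
		ops->destroy(id);
}

/*
 * After: with one implementation left, call it directly; the hook,
 * its NULL check, and (in the kernel) an indirect branch all go away.
 */
static void release_after(int id)
{
	only_destroy(id);
}

int main(void)
{
	release_before(&the_only_ops, 1);
	release_after(2);
	return 0;
}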
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/timewait_sock.h | 7 ------- net/ipv4/inet_timewait_sock.c | 5 +++-- net/ipv4/tcp_ipv4.c | 1 - net/ipv4/tcp_minisocks.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 5 files changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 62b3e9f2aed4..0a85ac64a66d 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,13 +15,6 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - void (*twsk_destructor)(struct sock *sk); }; -static inline void twsk_destructor(struct sock *sk) -{ - if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) - sk->sk_prot->twsk_prot->twsk_destructor(sk); -} - #endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed..5b5426b8ee92 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -15,7 +15,7 @@ #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> - +#include <net/tcp.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -74,7 +74,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; - twsk_destructor((struct sock *)tw); + + tcp_twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9543f1538359..a48b98f67b6a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2459,7 +2459,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_destructor= tcp_twsk_destructor, }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2994c9222c9c..d1c9e4088646 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -401,7 +401,6 @@ void tcp_twsk_destructor(struct sock *sk) #endif tcp_ao_destroy_sock(sk, true); } -EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5620d9e50e19..d99717376bff 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2050,7 +2050,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From 8150f3a44b17cded59c4cfb71efd59f0a293c48e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:58 +0000 Subject: tcp: Remove hashinfo test for inet6?_lookup_run_sk_lookup(). Commit 6c886db2e78c ("net: remove duplicate sk_lookup helpers") started to check if hashinfo == net->ipv4.tcp_death_row.hashinfo in __inet_lookup_listener() and inet6_lookup_listener() and stopped invoking BPF sk_lookup prog for DCCP. DCCP has gone, and the condition is always true. Let's remove the hashinfo test.
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 3 +-- net/ipv6/inet6_hashtables.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fef71dd72521..374adb8a2640 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -436,8 +436,7 @@ struct sock *__inet_lookup_listener(const struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled) && - hashinfo == net->ipv4.tcp_death_row.hashinfo) { + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index dbb10774764a..d6c3db31dcab 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -211,8 +211,7 @@ struct sock *inet6_lookup_listener(const struct net *net, unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled) && - hashinfo == net->ipv4.tcp_death_row.hashinfo) { + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); -- cgit v1.2.3 From cb16f4b6c73df4be16b74099f826fea30ef72426 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:06:59 +0000 Subject: tcp: Don't pass hashinfo to socket lookup helpers. These socket lookup functions required struct inet_hashinfo because they were shared by TCP and DCCP. * __inet_lookup_established() * __inet_lookup_listener() * __inet6_lookup_established() * inet6_lookup_listener() DCCP has gone, and we don't need to pass hashinfo down to them. Let's fetch net->ipv4.tcp_death_row.hashinfo directly in the above 4 functions.
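The transformation itself is mechanical: every caller computed the same net->ipv4.tcp_death_row.hashinfo and passed it down, so the parameter carried no information once DCCP's second table disappeared. A minimal sketch of pushing such a derivable argument into the callee; "struct netns" and "struct table" below are invented stand-ins for struct net and the TCP hash table, not kernel types:

#include <stdio.h>

struct table { const char *name; };

/* Invented per-namespace container, standing in for struct net. */
struct netns { struct table tcp_table; };

/* Before: callers must thread the table through, although only one exists. */
static void lookup_before(struct netns *net, struct table *tbl, int port)
{
	(void)net;
	printf("before: %s, port %d\n", tbl->name, port);
}

/* After: the helper derives the only possible table from @net itself. */
static void lookup_after(struct netns *net, int port)
{
	struct table *tbl = &net->tcp_table;

	printf("after: %s, port %d\n", tbl->name, port);
}

int main(void)
{
	struct netns net = { .tcp_table = { .name = "ehash" } };

	lookup_before(&net, &net.tcp_table, 80);	/* redundant argument */
	lookup_after(&net, 80);
	return 0;
}

Besides shrinking every call site, this removes the possibility of a caller passing a mismatched table, which is exactly the class of check the previous patch deleted.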
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 9 ++--- drivers/net/ethernet/netronome/nfp/crypto/tls.c | 9 ++--- include/net/inet6_hashtables.h | 18 +++------ include/net/inet_hashtables.h | 37 ++++++++---------- net/core/filter.c | 5 +-- net/ipv4/esp4.c | 4 +- net/ipv4/inet_diag.c | 6 +-- net/ipv4/inet_hashtables.c | 28 +++++++------- net/ipv4/netfilter/nf_socket_ipv4.c | 3 +- net/ipv4/netfilter/nf_tproxy_ipv4.c | 5 +-- net/ipv4/tcp_ipv4.c | 16 +++----- net/ipv4/tcp_offload.c | 3 +- net/ipv6/esp6.c | 4 +- net/ipv6/inet6_hashtables.c | 45 +++++++++++----------- net/ipv6/netfilter/nf_socket_ipv6.c | 3 +- net/ipv6/netfilter/nf_tproxy_ipv6.c | 5 +-- net/ipv6/tcp_ipv6.c | 14 +++---- net/ipv6/tcpv6_offload.c | 3 +- 18 files changed, 94 insertions(+), 123 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 65ccb33edafb..d7a11ff9bbdb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -498,9 +498,9 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) depth += sizeof(struct iphdr); th = (void *)iph + sizeof(struct iphdr); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, iph->daddr, - th->dest, netdev->ifindex); + sk = inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, th->dest, + netdev->ifindex); #if IS_ENABLED(CONFIG_IPV6) } else { struct ipv6hdr *ipv6h = (struct ipv6hdr *)iph; @@ -508,8 +508,7 @@ static void resync_update_sn(struct mlx5e_rq *rq, struct sk_buff *skb) depth += sizeof(struct ipv6hdr); th = (void *)ipv6h + sizeof(struct ipv6hdr); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &ipv6h->saddr, th->source, + sk = __inet6_lookup_established(net, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->dest), netdev->ifindex, 0); #endif diff --git a/drivers/net/ethernet/netronome/nfp/crypto/tls.c b/drivers/net/ethernet/netronome/nfp/crypto/tls.c index f80f1a6953fa..f252ecdcd2cd 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/tls.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/tls.c @@ -495,14 +495,13 @@ int nfp_net_tls_rx_resync_req(struct net_device *netdev, switch (ipv6h->version) { case 4: - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, iph->daddr, - th->dest, netdev->ifindex); + sk = inet_lookup_established(net, iph->saddr, th->source, + iph->daddr, th->dest, + netdev->ifindex); break; #if IS_ENABLED(CONFIG_IPV6) case 6: - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &ipv6h->saddr, th->source, + sk = __inet6_lookup_established(net, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->dest), netdev->ifindex, 0); break; diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index ab3929a2a956..1f985d2012ce 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -41,7 +41,6 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, * The sockhash lock must be held as a reader here. 
*/ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -65,7 +64,6 @@ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -83,7 +81,6 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -92,14 +89,14 @@ static inline struct sock *__inet6_lookup(const struct net *net, const int dif, const int sdif, bool *refcounted) { - struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, + struct sock *sk = __inet6_lookup_established(net, saddr, sport, + daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return inet6_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } @@ -143,8 +140,7 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, @@ -161,14 +157,12 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(net, hashinfo, skb, - doff, &ip6h->saddr, sport, + return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 19dbd9081d5a..a3b32241c2f2 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -294,7 +294,6 @@ int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, @@ -302,12 +301,12 @@ struct sock *__inet_lookup_listener(const struct net *net, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif) + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, + int dif, int sdif) { - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } @@ -358,7 +357,6 @@ static inline bool inet_match(const struct net *net, const struct sock *sk, * not check it for lookups anymore, thanks Alexey. 
-DaveM */ struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); @@ -384,18 +382,16 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); -static inline struct sock * - inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const __be16 dport, - const int dif) +static inline struct sock *inet_lookup_established(struct net *net, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif) { - return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, + return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -405,18 +401,17 @@ static inline struct sock *__inet_lookup(struct net *net, u16 hnum = ntohs(dport); struct sock *sk; - sk = __inet_lookup_established(net, hashinfo, saddr, sport, + sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -425,7 +420,7 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; bool refcounted; - sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) @@ -473,8 +468,7 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, +static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, @@ -492,8 +486,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet_lookup(net, hashinfo, skb, - doff, iph->saddr, sport, + return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..5da1cad66be2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6767,7 +6767,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6776,7 +6775,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, hinfo, NULL, 0, + sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6790,7 +6789,7 @@ static struct sock 
*sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, hinfo, NULL, 0, + sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f14a41ee4aa1..2c922afadb8f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, - dport, x->props.saddr.a4, sport, 0); + sk = inet_lookup_established(net, x->id.daddr.a4, dport, + x->props.saddr.a4, sport, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 549f1f521f4f..462406948c84 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -526,18 +526,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, rcu_read_lock(); if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, NULL, 0, + sk = inet6_lookup(net, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 374adb8a2640..4bc2b1921d2b 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -425,13 +425,13 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, } struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; @@ -444,6 +444,7 @@ struct sock *__inet_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -489,21 +490,22 @@ void sock_edemux(struct sk_buff *skb) EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const u16 hnum, - const int dif, const int sdif) + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif, const int sdif) { - INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; + INET_ADDR_COOKIE(acookie, saddr, daddr); const struct hlist_nulls_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. 
- */ - unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a1350fc25838..5080fa5fbf6a 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 73e66a088e25..041c3f37f237 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, hinfo, skb, + sk = inet_lookup_listener(net, skb, ip_hdrlen(skb) + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, hinfo, saddr, sport, + sk = inet_lookup_established(net, saddr, sport, daddr, dport, in->ifindex); break; default: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a48b98f67b6a..a0c93b24c6e0 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -506,8 +506,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; int err; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->daddr, th->dest, iph->saddr, + sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); @@ -823,8 +822,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
*/ - sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ @@ -1992,8 +1990,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return 0; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { @@ -2236,8 +2233,7 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: - sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), th->source, + sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -2426,9 +2422,7 @@ do_time_wait: &drop_reason); switch (tw_status) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(net, - net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index be5c2294610e..e6612bd84d09 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 72adfc107b55..e75da98f5283 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, - dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport, + &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index d6c3db31dcab..a3a9ea49fee2 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif, const int sdif) + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif, const int sdif) { - struct sock *sk; - const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. 
- */ - unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - + const struct hlist_nulls_node *node; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -200,13 +199,15 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif, const int sdif) + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; @@ -219,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -243,7 +245,6 @@ done: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -252,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net, struct sock *sk; bool refcounted; - sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 9ea5ef56cb27..ced8bd44828e 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 52f828bb5a83..b2f59ed9d7cc 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, hinfo, skb, + sk = inet6_lookup_listener(net, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -109,7 +108,7 @@ 
nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + sk = __inet6_lookup_established(net, saddr, sport, daddr, ntohs(dport), in->ifindex, 0); break; default: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d99717376bff..8b2e7b7afbd8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -388,8 +388,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->daddr, th->dest, + sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -1073,8 +1072,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, &ipv6h->saddr, th->source, + sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) @@ -1789,7 +1787,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) @@ -1976,8 +1974,7 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -2029,8 +2026,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a8a04f441e78..effeba58630b 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; -- cgit v1.2.3 From f1241200cd66b3e25fd2a44dd961d9720e965aa1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:07:00 +0000 Subject: tcp: Don't pass hashinfo to inet_diag helpers. These inet_diag functions required struct inet_hashinfo because they were shared by TCP and DCCP: * inet_diag_dump_icsk() * inet_diag_dump_one_icsk() * inet_diag_find_one_icsk() DCCP has gone, and we don't need to pass hashinfo down to them. Let's fetch net->ipv4.tcp_death_row.hashinfo directly in the first 2 functions. Note that inet_diag_find_one_icsk() hasn't needed hashinfo since the previous patch. We will move TCP-specific functions to tcp_diag.c in the next patch.
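One thing that makes inet_diag_dump_icsk(), moved wholesale in the next patch, look intimidating is netlink's resumable-dump contract: each call may fill only one skb, so the walker records its position (table phase, bucket, offset) in cb->args[] and the next recvmsg() resumes from there. A compact sketch of that cursor idiom, with a fixed 2-D array standing in for the hash buckets and a small batch standing in for a full skb; all names here are invented:

#include <stdio.h>

#define NBUCKETS	3
#define BUCKET_LEN	5
#define BATCH		4	/* pretend the netlink skb is full after 4 */

/* Mirrors how cb->args[1] / cb->args[2] act as a bucket/offset cursor. */
struct cursor { int bucket; int offset; };

static const int buckets[NBUCKETS][BUCKET_LEN] = {
	{ 1, 2, 3, 4, 5 }, { 6, 7, 8, 9, 10 }, { 11, 12, 13, 14, 15 },
};

/* Emit up to BATCH entries, saving the position; returns 0 when done. */
static int dump_once(struct cursor *c)
{
	int emitted = 0;

	for (; c->bucket < NBUCKETS; c->bucket++, c->offset = 0) {
		for (; c->offset < BUCKET_LEN; c->offset++) {
			if (emitted == BATCH)
				return 1;	/* resume here next time */
			printf("%d ", buckets[c->bucket][c->offset]);
			emitted++;
		}
	}
	return 0;
}

int main(void)
{
	struct cursor c = { 0, 0 };

	while (dump_once(&c))
		printf("| ");	/* each '|' marks a fresh dump call */
	printf("\n");
	return 0;
}

The kernel function additionally takes per-bucket locks and batches sockets into an on-stack array before filling the skb, but the cursor logic is the same shape.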
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 6 ++---- net/ipv4/inet_diag.c | 10 +++++----- net/ipv4/tcp_diag.c | 17 +++-------------- 3 files changed, 10 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index a9033696b0aa..34de992b5bd9 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -48,15 +48,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, +void inet_diag_dump_icsk(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r); -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, +int inet_diag_dump_one_icsk(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req); int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 462406948c84..5cbbb0695aff 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -519,7 +519,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, } struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req) { struct sock *sk; @@ -562,8 +561,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net, } EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, +int inet_diag_dump_one_icsk(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; @@ -573,7 +571,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sock *sk; int err; - sk = inet_diag_find_one_icsk(net, hashinfo, req); + sk = inet_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -1018,7 +1016,7 @@ static void twsk_build_assert(void) #endif } -void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, +void inet_diag_dump_icsk(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { @@ -1026,10 +1024,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; int i, num, s_i, s_num; struct nlattr *bc; struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 45e174b8cd22..7cd9d032efdd 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -180,21 +180,13 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct inet_hashinfo *hinfo; - - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; - - inet_diag_dump_icsk(hinfo, skb, cb, r); + inet_diag_dump_icsk(skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback 
*cb, const struct inet_diag_req_v2 *req) { - struct inet_hashinfo *hinfo; - - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; - - return inet_diag_dump_one_icsk(hinfo, cb, req); + return inet_diag_dump_one_icsk(cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY @@ -202,13 +194,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); - struct inet_hashinfo *hinfo; struct sock *sk; int err; - hinfo = net->ipv4.tcp_death_row.hashinfo; - sk = inet_diag_find_one_icsk(net, hinfo, req); - + sk = inet_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); -- cgit v1.2.3 From 382a4d9cb6dc07643345e15c49738088a727d29b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:07:01 +0000 Subject: tcp: Move TCP-specific diag functions to tcp_diag.c. tcp_diag_dump() and tcp_diag_dump_one() are just wrappers around inet_diag_dump_icsk() and inet_diag_dump_one_icsk(), respectively. Let's inline them in tcp_diag.c and move their static callees as well. Note that inet_sk_attr_size() is merged into tcp_diag_get_aux_size(), and we remove inet_diag_handler.idiag_get_aux_size() accordingly. While at it, BUG_ON() is replaced with DEBUG_NET_WARN_ON_ONCE(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250822190803.540788-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 11 -- net/ipv4/inet_diag.c | 479 ---------------------------------------------- net/ipv4/tcp_diag.c | 460 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 455 insertions(+), 495 deletions(-) (limited to 'net') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 34de992b5bd9..30bf8f7ea62b 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,9 +24,6 @@ struct inet_diag_handler { bool net_admin, struct sk_buff *skb); - size_t (*idiag_get_aux_size)(struct sock *sk, - bool net_admin); - int (*destroy)(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req); @@ -48,14 +45,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -void inet_diag_dump_icsk(struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r); -int inet_diag_dump_one_icsk(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req); - -struct sock *inet_diag_find_one_icsk(struct net *net, - const struct inet_diag_req_v2 *req); int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5cbbb0695aff..9d4dcd17728c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -20,9 +20,6 @@ #include #include #include -#include -#include -#include #include #include @@ -97,31 +94,6 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(struct sock *sk, - const struct inet_diag_req_v2 *req, - bool net_admin) -{ - const struct inet_diag_handler *handler; - size_t aux = 0; - - rcu_read_lock(); - handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); - DEBUG_NET_WARN_ON_ONCE(!handler); - if (handler && handler->idiag_get_aux_size) - aux = handler->idiag_get_aux_size(sk, net_admin); - rcu_read_unlock(); - - return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(sizeof(struct inet_diag_msg)) - + inet_diag_msg_attrs_size() -
+ nla_total_size(sizeof(struct inet_diag_meminfo)) - + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) - + nla_total_size(TCP_CA_NAME_MAX) - + nla_total_size(sizeof(struct tcpvegas_info)) - + aux - + 64; -} - int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, @@ -422,181 +394,6 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_twsk_diag_fill(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct inet_timewait_sock *tw = inet_twsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, - sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); - - inet_diag_msg_common_fill(r, sk); - r->idiag_retrans = 0; - - r->idiag_state = READ_ONCE(tw->tw_substate); - r->idiag_timer = 3; - tmo = tw->tw_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - tw->tw_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct request_sock *reqsk = inet_reqsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - inet_diag_msg_common_fill(r, sk); - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = reqsk->num_retrans; - - BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != - offsetof(struct sock, sk_cookie)); - - tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - u16 nlmsg_flags, bool net_admin) -{ - if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, - net_admin); -} - -struct sock *inet_diag_find_one_icsk(struct net *net, - const struct inet_diag_req_v2 *req) -{ - struct sock *sk; - - rcu_read_lock(); - if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); -#if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { - if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && - ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], - req->id.idiag_dport, req->id.idiag_src[3], - req->id.idiag_sport, req->id.idiag_if); - else - sk = inet6_lookup(net, 
NULL, 0, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } - rcu_read_unlock(); - if (!sk) - return ERR_PTR(-ENOENT); - - if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { - sock_gen_put(sk); - return ERR_PTR(-ENOENT); - } - - return sk; -} -EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); - -int inet_diag_dump_one_icsk(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - struct sk_buff *in_skb = cb->skb; - bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; - struct sock *sk; - int err; - - sk = inet_diag_find_one_icsk(net, req); - if (IS_ERR(sk)) - return PTR_ERR(sk); - - rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); - if (!rep) { - err = -ENOMEM; - goto out; - } - - err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); - if (err < 0) { - WARN_ON(err == -EMSGSIZE); - nlmsg_free(rep); - goto out; - } - err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); - -out: - if (sk) - sock_gen_put(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); - static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, @@ -990,282 +787,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 0 : -EINVAL; } -static void twsk_build_assert(void) -{ - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != - offsetof(struct sock, sk_family)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != - offsetof(struct inet_sock, inet_num)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != - offsetof(struct inet_sock, inet_dport)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != - offsetof(struct inet_sock, inet_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != - offsetof(struct inet_sock, inet_daddr)); - -#if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != - offsetof(struct sock, sk_v6_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != - offsetof(struct sock, sk_v6_daddr)); -#endif -} - -void inet_diag_dump_icsk(struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); - struct inet_diag_dump_data *cb_data = cb->data; - struct net *net = sock_net(skb->sk); - u32 idiag_states = r->idiag_states; - struct inet_hashinfo *hashinfo; - int i, num, s_i, s_num; - struct nlattr *bc; - struct sock *sk; - - hashinfo = net->ipv4.tcp_death_row.hashinfo; - bc = cb_data->inet_diag_nla_bc; - if (idiag_states & TCPF_SYN_RECV) - idiag_states |= TCPF_NEW_SYN_RECV; - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) - goto skip_listen_ht; - - for (i = s_i; i <= hashinfo->lhash2_mask; i++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - - num = 0; - ilb = &hashinfo->lhash2[i]; - - if (hlist_nulls_empty(&ilb->nulls_head)) { - s_num = 0; - continue; - } - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (num < s_num) { - num++; - continue; - } - - if (r->sdiag_family != AF_UNSPEC && - 
sk->sk_family != r->sdiag_family) - goto next_listen; - - if (r->id.idiag_sport != inet->inet_sport && - r->id.idiag_sport) - goto next_listen; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_listen; - - if (inet_sk_diag_fill(sk, inet_csk(sk), skb, - cb, r, NLM_F_MULTI, - net_admin) < 0) { - spin_unlock(&ilb->lock); - goto done; - } - -next_listen: - ++num; - } - spin_unlock(&ilb->lock); - - s_num = 0; - } -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - -/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets - * with bh disabled. - */ -#define SKARR_SZ 16 - - /* Dump bound but inactive (not listening, connecting, etc.) sockets */ - if (cb->args[0] == 1) { - if (!(idiag_states & TCPF_BOUND_INACTIVE)) - goto skip_bind_ht; - - for (i = s_i; i < hashinfo->bhash_size; i++) { - struct inet_bind_hashbucket *ibb; - struct inet_bind2_bucket *tb2; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - -resume_bind_walk: - num = 0; - accum = 0; - ibb = &hashinfo->bhash2[i]; - - if (hlist_empty(&ibb->chain)) { - s_num = 0; - continue; - } - spin_lock_bh(&ibb->lock); - inet_bind_bucket_for_each(tb2, &ibb->chain) { - if (!net_eq(ib2_net(tb2), net)) - continue; - - sk_for_each_bound(sk, &tb2->owners) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_bind; - - if (sk->sk_state != TCP_CLOSE || - !inet->inet_num) - goto next_bind; - - if (r->sdiag_family != AF_UNSPEC && - r->sdiag_family != sk->sk_family) - goto next_bind; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_bind; - - sock_hold(sk); - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - goto pause_bind_walk; -next_bind: - num++; - } - } -pause_bind_walk: - spin_unlock_bh(&ibb->lock); - - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = inet_sk_diag_fill(sk_arr[idx], - NULL, skb, cb, - r, NLM_F_MULTI, - net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_put(sk_arr[idx]); - } - if (res < 0) - goto done; - - cond_resched(); - - if (accum == SKARR_SZ) { - s_num = num + 1; - goto resume_bind_walk; - } - - s_num = 0; - } -skip_bind_ht: - cb->args[0] = 2; - s_i = num = s_num = 0; - } - - if (!(idiag_states & ~TCPF_LISTEN)) - goto out; - - for (i = s_i; i <= hashinfo->ehash_mask; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - spinlock_t *lock = inet_ehash_lockp(hashinfo, i); - struct hlist_nulls_node *node; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - - if (hlist_nulls_empty(&head->chain)) - continue; - - if (i > s_i) - s_num = 0; - -next_chunk: - num = 0; - accum = 0; - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &head->chain) { - int state; - - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) - goto next_normal; - state = (sk->sk_state == TCP_TIME_WAIT) ? 
- READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; - if (!(idiag_states & (1 << state))) - goto next_normal; - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != sk->sk_dport && - r->id.idiag_dport) - goto next_normal; - twsk_build_assert(); - - if (!inet_diag_bc_sk(bc, sk)) - goto next_normal; - - if (!refcount_inc_not_zero(&sk->sk_refcnt)) - goto next_normal; - - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - break; -next_normal: - ++num; - } - spin_unlock_bh(lock); - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, cb, r, - NLM_F_MULTI, net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_gen_put(sk_arr[idx]); - } - if (res < 0) - break; - cond_resched(); - if (accum == SKARR_SZ) { - s_num = num + 1; - goto next_chunk; - } - } - -done: - cb->args[1] = i; - cb->args[2] = num; -out: - ; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); - static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 7cd9d032efdd..2f3a779ce7a2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -12,6 +12,9 @@ #include +#include +#include +#include #include #include @@ -174,19 +177,467 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) size += ulp_ops->get_info_size(sk, net_admin); } } - return size; + + return size + + nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + +static int tcp_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT); + + inet_diag_msg_common_fill(r, sk); + r->idiag_retrans = 0; + + r->idiag_state = READ_ONCE(tw->tw_substate); + r->idiag_timer = 3; + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct request_sock *reqsk = inet_reqsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = reqsk->num_retrans; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, 
sk_cookie)); + + tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + u16 nlmsg_flags, bool net_admin) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); +} + +static void twsk_build_assert(void) +{ + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + offsetof(struct inet_sock, inet_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); + +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); +#endif } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(skb, cb, r); + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; + struct net *net = sock_net(skb->sk); + u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; + int i, num, s_i, s_num; + struct nlattr *bc; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + bc = cb_data->inet_diag_nla_bc; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) + goto skip_listen_ht; + + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + + num = 0; + ilb = &hashinfo->lhash2[i]; + + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + + if (num < s_num) { + num++; + continue; + } + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { + spin_unlock(&ilb->lock); + goto done; + } + +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + + s_num = 0; + } +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. 
+ */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } + + if (!(idiag_states & ~TCPF_LISTEN)) + goto out; + + for (i = s_i; i <= hashinfo->ehash_mask; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + spinlock_t *lock = inet_ehash_lockp(hashinfo, i); + struct hlist_nulls_node *node; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + + if (hlist_nulls_empty(&head->chain)) + continue; + + if (i > s_i) + s_num = 0; + +next_chunk: + num = 0; + accum = 0; + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &head->chain) { + int state; + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next_normal; + state = (sk->sk_state == TCP_TIME_WAIT) ? 
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; + if (!(idiag_states & (1 << state))) + goto next_normal; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_normal; + if (r->id.idiag_sport != htons(sk->sk_num) && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != sk->sk_dport && + r->id.idiag_dport) + goto next_normal; + twsk_build_assert(); + + if (!inet_diag_bc_sk(bc, sk)) + goto next_normal; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_normal; + + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + break; +next_normal: + ++num; + } + spin_unlock_bh(lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_gen_put(sk_arr[idx]); + } + if (res < 0) + break; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto next_chunk; + } + } + +done: + cb->args[1] = i; + cb->args[2] = num; +out: + ; +} + +static struct sock *tcp_diag_find_one_icsk(struct net *net, + const struct inet_diag_req_v2 *req) +{ + struct sock *sk; + + rcu_read_lock(); + if (req->sdiag_family == AF_INET) { + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + } else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, NULL, 0, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); +#endif + } else { + rcu_read_unlock(); + return ERR_PTR(-EINVAL); + } + rcu_read_unlock(); + if (!sk) + return ERR_PTR(-ENOENT); + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_gen_put(sk); + return ERR_PTR(-ENOENT); + } + + return sk; } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(cb, req); + struct sk_buff *in_skb = cb->skb; + struct sk_buff *rep; + struct sock *sk; + struct net *net; + bool net_admin; + int err; + + net = sock_net(in_skb->sk); + sk = tcp_diag_find_one_icsk(net, req); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); + rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + nlmsg_free(rep); + goto out; + } + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + +out: + if (sk) + sock_gen_put(sk); + + return err; } #ifdef CONFIG_INET_DIAG_DESTROY @@ -197,7 +648,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, struct sock *sk; int err; - sk = inet_diag_find_one_icsk(net, req); + sk = tcp_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -215,7 +666,6 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, - .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef 
CONFIG_INET_DIAG_DESTROY -- cgit v1.2.3 From 26e84445f02ce6b2fe5f3e0e28ff7add77f35e08 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 13 Aug 2025 16:52:36 +0300 Subject: wifi: cfg80211: fix use-after-free in cmp_bss() Following the bss_free() quirk introduced in commit 776b3580178f ("cfg80211: track hidden SSID networks properly"), adjust cfg80211_update_known_bss() to free the last beacon frame elements only if they're not shared via the corresponding 'hidden_beacon_bss' pointer. Reported-by: syzbot+30754ca335e6fb7e3092@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=30754ca335e6fb7e3092 Fixes: 3ab8227d3e7d ("cfg80211: refactor cfg80211_bss_update") Signed-off-by: Dmitry Antipov Link: https://patch.msgid.link/20250813135236.799384-1-dmantipov@yandex.ru Signed-off-by: Johannes Berg --- net/wireless/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a8339ed52404..6c7b7c3828a4 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1916,7 +1916,8 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, */ f = rcu_access_pointer(new->pub.beacon_ies); - kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); + if (!new->pub.hidden_beacon_bss) + kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); return false; } -- cgit v1.2.3 From a33b375ab5b3a9897a0ab76be8258d9f6b748628 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 25 Aug 2025 10:29:11 +0800 Subject: wifi: mac80211: fix incorrect type for ret The variable ret is declared as a u32 type, but it is assigned a value of -EOPNOTSUPP. Since unsigned types cannot correctly represent negative values, the type of ret should be changed to int. Signed-off-by: Liao Yuanhong Link: https://patch.msgid.link/20250825022911.139377-1-liaoyuanhong@vivo.com Signed-off-by: Johannes Berg --- net/mac80211/driver-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 181bcb34b795..55105d238d6b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1416,7 +1416,7 @@ drv_get_ftm_responder_stats(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats) { - u32 ret = -EOPNOTSUPP; + int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); -- cgit v1.2.3 From 7e2f3213e85eba00acb4cfe6d71647892d63c3a1 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 26 Aug 2025 18:54:37 +1000 Subject: wifi: mac80211: increase scan_ies_len for S1G Currently, the S1G capability element is not taken into account in scan_ies_len, which leads to a buffer length validation failure in ieee80211_prep_hw_scan() and a subsequent WARN in __ieee80211_start_scan(). This prevents hw scanning from functioning. To fix this, ensure we accommodate the S1G capability length.
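As a note on the accounting pattern: each information element contributes two header octets (element ID and length) plus its payload to scan_ies_len. A minimal sketch, assuming the S1G line follows the same pattern as the existing HT/VHT entries in ieee80211_register_hw() (the S1G addition is what this patch introduces; see the diff below):

	if (supp_ht)
		local->scan_ies_len += 2 + sizeof(struct ieee80211_ht_cap);
	if (supp_vht)
		local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap);
	if (supp_s1g)	/* the line this patch adds */
		local->scan_ies_len += 2 + sizeof(struct ieee80211_s1g_cap);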
Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250826085437.3493-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- net/mac80211/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 9c8f18b258a6..3ae6104e5cb2 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1111,7 +1111,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht, supp_he, supp_eht; + bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g; struct cfg80211_chan_def dflt_chandef = {}; if (ieee80211_hw_check(hw, QUEUE_CONTROL) && @@ -1227,6 +1227,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_vht = false; supp_he = false; supp_eht = false; + supp_s1g = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; @@ -1274,6 +1275,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = sband->n_bitrates; supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; + supp_s1g = supp_s1g || sband->s1g_cap.s1g; for_each_sband_iftype_data(sband, i, iftd) { u8 he_40_mhz_cap; @@ -1406,6 +1408,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); + if (supp_s1g) + local->scan_ies_len += 2 + sizeof(struct ieee80211_s1g_cap); + /* * HE cap element is variable in size - set len to allow max size */ if (supp_he) { -- cgit v1.2.3 From d2b007374551ac09db16badde575cdd698f6fc92 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:50 +0300 Subject: devlink: Move graceful period parameter to reporter ops Move the default graceful period from a parameter to devlink_health_reporter_create() to a field in the devlink_health_reporter_ops structure. This change improves consistency, as the graceful period is inherently tied to the reporter's behavior and recovery policy. It simplifies the signature of devlink_health_reporter_create() and its internal helper functions. It also centralizes the reporter configuration at the ops structure, preparing the groundwork for a downstream patch that will introduce a devlink health reporter burst period attribute whose default value will similarly be provided by the driver via the ops structure. 
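To illustrate the conversion a driver makes, here is a minimal hypothetical sketch; my_fw_recover, my_fw_dump and the period value are illustrative, not taken from any driver in this series, while the devlink ops field and create call match the patch below:

	#define MY_FW_GRACEFUL_PERIOD 60000	/* msec, driver-chosen default */

	static const struct devlink_health_reporter_ops my_fw_reporter_ops = {
		.name = "fw_fatal",
		.recover = my_fw_recover,
		.dump = my_fw_dump,
		/* previously passed as the graceful_period argument at create time */
		.default_graceful_period = MY_FW_GRACEFUL_PERIOD,
	};

	/* was: devl_health_reporter_create(devlink, &my_fw_reporter_ops,
	 *				   MY_FW_GRACEFUL_PERIOD, priv);
	 */
	reporter = devl_health_reporter_create(devlink, &my_fw_reporter_ops, priv);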
Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/main.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 2 +- drivers/net/ethernet/huawei/hinic/hinic_devlink.c | 10 +++--- drivers/net/ethernet/intel/ice/devlink/health.c | 3 +- .../ethernet/marvell/octeontx2/af/rvu_devlink.c | 32 ++++++++++++----- .../mellanox/mlx5/core/diag/reporter_vnic.c | 2 +- .../ethernet/mellanox/mlx5/core/en/reporter_rx.c | 10 +++--- .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 10 +++--- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 41 +++++++++++++--------- drivers/net/ethernet/mellanox/mlxsw/core.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_devlink.c | 9 ++--- drivers/net/netdevsim/health.c | 4 +-- include/net/devlink.h | 11 +++--- net/devlink/health.c | 28 ++++++--------- 15 files changed, 97 insertions(+), 71 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/amd/pds_core/main.c b/drivers/net/ethernet/amd/pds_core/main.c index 9b81e1c260c2..c7a2eff57632 100644 --- a/drivers/net/ethernet/amd/pds_core/main.c +++ b/drivers/net/ethernet/amd/pds_core/main.c @@ -280,7 +280,7 @@ static int pdsc_init_pf(struct pdsc *pdsc) goto err_out_del_dev; } - hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, 0, pdsc); + hr = devl_health_reporter_create(dl, &pdsc_fw_reporter_ops, pdsc); if (IS_ERR(hr)) { devl_unlock(dl); dev_warn(pdsc->dev, "Failed to create fw reporter: %pe\n", hr); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 4c4581b0342e..43fb75806cd6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -220,7 +220,7 @@ __bnxt_dl_reporter_create(struct bnxt *bp, { struct devlink_health_reporter *reporter; - reporter = devlink_health_reporter_create(bp->dl, ops, 0, bp); + reporter = devlink_health_reporter_create(bp->dl, ops, bp); if (IS_ERR(reporter)) { netdev_warn(bp->dev, "Failed to create %s health reporter, rc = %ld\n", ops->name, PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c index 03e42512a2d5..300bc267a259 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_devlink.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_devlink.c @@ -443,8 +443,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv) struct devlink *devlink = priv_to_devlink(priv); priv->hw_fault_reporter = - devlink_health_reporter_create(devlink, &hinic_hw_fault_reporter_ops, - 0, priv); + devlink_health_reporter_create(devlink, + &hinic_hw_fault_reporter_ops, + priv); if (IS_ERR(priv->hw_fault_reporter)) { dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create hw fault reporter, err: %ld\n", PTR_ERR(priv->hw_fault_reporter)); @@ -452,8 +453,9 @@ int hinic_health_reporters_create(struct hinic_devlink_priv *priv) } priv->fw_fault_reporter = - devlink_health_reporter_create(devlink, &hinic_fw_fault_reporter_ops, - 0, priv); + devlink_health_reporter_create(devlink, + &hinic_fw_fault_reporter_ops, + priv); if (IS_ERR(priv->fw_fault_reporter)) { dev_warn(&priv->hwdev->hwif->pdev->dev, "Failed to create fw fault reporter, err: %ld\n", PTR_ERR(priv->fw_fault_reporter)); diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c 
b/drivers/net/ethernet/intel/ice/devlink/health.c index ab519c0f28bf..8e9a8a8178d4 100644 --- a/drivers/net/ethernet/intel/ice/devlink/health.c +++ b/drivers/net/ethernet/intel/ice/devlink/health.c @@ -450,9 +450,8 @@ ice_init_devlink_rep(struct ice_pf *pf, { struct devlink *devlink = priv_to_devlink(pf); struct devlink_health_reporter *rep; - const u64 graceful_period = 0; - rep = devl_health_reporter_create(devlink, ops, graceful_period, pf); + rep = devl_health_reporter_create(devlink, ops, pf); if (IS_ERR(rep)) { struct device *dev = ice_pf_to_dev(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index 27c3a2daaaa9..3735372539bd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -505,7 +505,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) rvu_reporters->nix_event_ctx = nix_event_context; rvu_reporters->rvu_hw_nix_intr_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_intr_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_intr_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_intr reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_intr_reporter)); @@ -513,7 +515,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_gen_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_gen_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_gen_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_gen reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_gen_reporter)); @@ -521,7 +525,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_err_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_err_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_err_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_err_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_err reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_err_reporter)); @@ -529,7 +535,9 @@ static int rvu_nix_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_nix_ras_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_nix_ras_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_nix_ras_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)) { dev_warn(rvu->dev, "Failed to create hw_nix_ras reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_nix_ras_reporter)); @@ -1051,7 +1059,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) rvu_reporters->npa_event_ctx = npa_event_context; rvu_reporters->rvu_hw_npa_intr_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_intr_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_intr_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_intr reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_intr_reporter)); @@ -1059,7 +1069,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_gen_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_gen_reporter_ops, 0, rvu); + 
devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_gen_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_gen reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_gen_reporter)); @@ -1067,7 +1079,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_err_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_err_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_err_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_err_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_err reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_err_reporter)); @@ -1075,7 +1089,9 @@ static int rvu_npa_register_reporters(struct rvu_devlink *rvu_dl) } rvu_reporters->rvu_hw_npa_ras_reporter = - devlink_health_reporter_create(rvu_dl->dl, &rvu_hw_npa_ras_reporter_ops, 0, rvu); + devlink_health_reporter_create(rvu_dl->dl, + &rvu_hw_npa_ras_reporter_ops, + rvu); if (IS_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)) { dev_warn(rvu->dev, "Failed to create hw_npa_ras reporter, err=%ld\n", PTR_ERR(rvu_reporters->rvu_hw_npa_ras_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index 32bb769f1829..73f5b62b8c7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -135,7 +135,7 @@ void mlx5_reporter_vnic_create(struct mlx5_core_dev *dev) health->vnic_reporter = devlink_health_reporter_create(devlink, &mlx5_reporter_vnic_ops, - 0, dev); + dev); if (IS_ERR(health->vnic_reporter)) mlx5_core_warn(dev, "Failed to create vnic reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 16c44d628eda..1b9ea72abc5a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -651,22 +651,24 @@ void mlx5e_reporter_icosq_resume_recovery(struct mlx5e_channel *c) mutex_unlock(&c->icosq_recovery_lock); } +#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 + static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { .name = "rx", .recover = mlx5e_rx_reporter_recover, .diagnose = mlx5e_rx_reporter_diagnose, .dump = mlx5e_rx_reporter_dump, + .default_graceful_period = MLX5E_REPORTER_RX_GRACEFUL_PERIOD, }; -#define MLX5E_REPORTER_RX_GRACEFUL_PERIOD 500 - void mlx5e_reporter_rx_create(struct mlx5e_priv *priv) { + struct devlink_port *port = priv->netdev->devlink_port; struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port, + reporter = devlink_port_health_reporter_create(port, &mlx5_rx_reporter_ops, - MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", PTR_ERR(reporter)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 85d5cb39b107..7a4a77f6fe6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -539,22 +539,24 @@ void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq) mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); } +#define 
MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 + static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { .name = "tx", .recover = mlx5e_tx_reporter_recover, .diagnose = mlx5e_tx_reporter_diagnose, .dump = mlx5e_tx_reporter_dump, + .default_graceful_period = MLX5_REPORTER_TX_GRACEFUL_PERIOD, }; -#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 - void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) { + struct devlink_port *port = priv->netdev->devlink_port; struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(priv->netdev->devlink_port, + reporter = devlink_port_health_reporter_create(port, &mlx5_tx_reporter_ops, - MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); + priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create tx reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..b231e7855bca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1447,7 +1447,7 @@ static void mlx5e_rep_vnic_reporter_create(struct mlx5e_priv *priv, reporter = devl_port_health_reporter_create(dl_port, &mlx5_rep_vnic_reporter_ops, - 0, rpriv); + rpriv); if (IS_ERR(reporter)) { mlx5_core_err(priv->mdev, "Failed to create representor vnic reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index cf7a1edd0530..b63c5a221eb9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -669,54 +669,61 @@ static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work) } } +#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 +#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 +#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 +#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD \ + MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD + +static +const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ecpf_ops = { + .name = "fw_fatal", + .recover = mlx5_fw_fatal_reporter_recover, + .dump = mlx5_fw_fatal_reporter_dump, + .default_graceful_period = + MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD, +}; + static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_pf_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, .dump = mlx5_fw_fatal_reporter_dump, + .default_graceful_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD, }; static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = mlx5_fw_fatal_reporter_recover, + .default_graceful_period = + MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD, }; -#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000 -#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000 -#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000 -#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD - void mlx5_fw_reporters_create(struct mlx5_core_dev *dev) { const struct devlink_health_reporter_ops *fw_fatal_ops; struct mlx5_core_health *health = &dev->priv.health; const struct devlink_health_reporter_ops *fw_ops; struct devlink *devlink = priv_to_devlink(dev); - u64 grace_period; - fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops; fw_ops = &mlx5_fw_reporter_pf_ops; if (mlx5_core_is_ecpf(dev)) { - grace_period = MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD; + fw_fatal_ops = &mlx5_fw_fatal_reporter_ecpf_ops; } else if (mlx5_core_is_pf(dev)) { - grace_period = 
MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD; + fw_fatal_ops = &mlx5_fw_fatal_reporter_pf_ops; } else { /* VF or SF */ - grace_period = MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD; fw_fatal_ops = &mlx5_fw_fatal_reporter_ops; fw_ops = &mlx5_fw_reporter_ops; } - health->fw_reporter = - devl_health_reporter_create(devlink, fw_ops, 0, dev); + health->fw_reporter = devl_health_reporter_create(devlink, fw_ops, dev); if (IS_ERR(health->fw_reporter)) mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(health->fw_reporter)); - health->fw_fatal_reporter = - devl_health_reporter_create(devlink, - fw_fatal_ops, - grace_period, - dev); + health->fw_fatal_reporter = devl_health_reporter_create(devlink, + fw_fatal_ops, + dev); if (IS_ERR(health->fw_fatal_reporter)) mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n", PTR_ERR(health->fw_fatal_reporter)); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 2bb2b77351bd..980f3223f124 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -2043,7 +2043,7 @@ static int mlxsw_core_health_init(struct mlxsw_core *mlxsw_core) return 0; fw_fatal = devl_health_reporter_create(devlink, &mlxsw_core_health_fw_fatal_ops, - 0, mlxsw_core); + mlxsw_core); if (IS_ERR(fw_fatal)) { dev_err(mlxsw_core->bus_info->dev, "Failed to create fw fatal reporter"); return PTR_ERR(fw_fatal); diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c index 1adc7fbb3f2f..94c5689b5abd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c +++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c @@ -87,20 +87,21 @@ qed_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter, return 0; } +#define QED_REPORTER_FW_GRACEFUL_PERIOD 0 + static const struct devlink_health_reporter_ops qed_fw_fatal_reporter_ops = { .name = "fw_fatal", .recover = qed_fw_fatal_reporter_recover, .dump = qed_fw_fatal_reporter_dump, + .default_graceful_period = QED_REPORTER_FW_GRACEFUL_PERIOD, }; -#define QED_REPORTER_FW_GRACEFUL_PERIOD 0 - void qed_fw_reporters_create(struct devlink *devlink) { struct qed_devlink *dl = devlink_priv(devlink); - dl->fw_reporter = devlink_health_reporter_create(devlink, &qed_fw_fatal_reporter_ops, - QED_REPORTER_FW_GRACEFUL_PERIOD, dl); + dl->fw_reporter = devlink_health_reporter_create(devlink, + &qed_fw_fatal_reporter_ops, dl); if (IS_ERR(dl->fw_reporter)) { DP_NOTICE(dl->cdev, "Failed to create fw reporter, err = %ld\n", PTR_ERR(dl->fw_reporter)); diff --git a/drivers/net/netdevsim/health.c b/drivers/net/netdevsim/health.c index 688f05316b5e..3bd0e7a489c3 100644 --- a/drivers/net/netdevsim/health.c +++ b/drivers/net/netdevsim/health.c @@ -183,14 +183,14 @@ int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink) health->empty_reporter = devl_health_reporter_create(devlink, &nsim_dev_empty_reporter_ops, - 0, health); + health); if (IS_ERR(health->empty_reporter)) return PTR_ERR(health->empty_reporter); health->dummy_reporter = devl_health_reporter_create(devlink, &nsim_dev_dummy_reporter_ops, - 0, health); + health); if (IS_ERR(health->dummy_reporter)) { err = PTR_ERR(health->dummy_reporter); goto err_empty_reporter_destroy; diff --git a/include/net/devlink.h b/include/net/devlink.h index 3119d053bc4d..c7ad7a981b39 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -746,6 +746,8 @@ enum devlink_health_reporter_state { * if priv_ctx is NULL, run a full dump * 
@diagnose: callback to diagnose the current status * @test: callback to trigger a test event + * @default_graceful_period: default min time (in msec) + * between recovery attempts */ struct devlink_health_reporter_ops { @@ -760,6 +762,7 @@ struct devlink_health_reporter_ops { struct netlink_ext_ack *extack); int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); + u64 default_graceful_period; }; /** @@ -1928,22 +1931,22 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); void devl_health_reporter_destroy(struct devlink_health_reporter *reporter); diff --git a/net/devlink/health.c b/net/devlink/health.c index b3ce8ecbb7fb..ba144b7426fa 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -108,11 +108,11 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; - if (WARN_ON(graceful_period && !ops->recover)) + if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); @@ -122,7 +122,7 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; - reporter->graceful_period = graceful_period; + reporter->graceful_period = ops->default_graceful_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -134,13 +134,12 @@ __devlink_health_reporter_create(struct devlink *devlink, * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -150,8 +149,7 @@ devl_port_health_reporter_create(struct devlink_port *port, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(port->devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -164,14 +162,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; struct devlink 
*devlink = port->devlink; devl_lock(devlink); - reporter = devl_port_health_reporter_create(port, ops, - graceful_period, priv); + reporter = devl_port_health_reporter_create(port, ops, priv); devl_unlock(devlink); return reporter; } @@ -182,13 +179,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -197,8 +193,7 @@ devl_health_reporter_create(struct devlink *devlink, if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -210,13 +205,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); - reporter = devl_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = devl_health_reporter_create(devlink, ops, priv); devl_unlock(devlink); return reporter; } -- cgit v1.2.3 From 20597fb9436e2e2372ddf782f0bb5ecbe3481068 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:51 +0300 Subject: devlink: Move health reporter recovery abort logic to a separate function Extract the health reporter recovery abort logic into a separate function devlink_health_recover_abort(). 
The function encapsulates the conditions for aborting recovery:

- When auto-recovery is disabled
- When the previous error wasn't recovered
- When within the grace period after the last recovery

Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Reviewed-by: Carolina Jubran Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- net/devlink/health.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/devlink/health.c b/net/devlink/health.c index ba144b7426fa..9d0d4a9face7 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -586,12 +586,33 @@ dump_err: return err; } +static bool +devlink_health_recover_abort(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state prev_state) +{ + unsigned long recover_ts_threshold; + + if (!reporter->auto_recover) + return false; + + /* abort if the previous error wasn't recovered */ + if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return true; + + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->graceful_period); + if (reporter->last_recovery_ts && reporter->recovery_count && + time_is_after_jiffies(recover_ts_threshold)) + return true; + + return false; +} + int devlink_health_report(struct devlink_health_reporter *reporter, const char *msg, void *priv_ctx) { enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; - unsigned long recover_ts_threshold; int ret; /* write a log message of the current error */ @@ -602,13 +623,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - /* abort if the previous error wasn't recovered */ - recover_ts_threshold = reporter->last_recovery_ts + - msecs_to_jiffies(reporter->graceful_period); - if (reporter->auto_recover && - (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - (reporter->last_recovery_ts && reporter->recovery_count && - time_is_after_jiffies(recover_ts_threshold)))) { + if (devlink_health_recover_abort(reporter, prev_health_state)) { trace_devlink_health_recover_aborted(devlink, reporter->ops->name, reporter->health_state, -- cgit v1.2.3 From 6a06d8c40510ba1ecf27977f528b1eb74f290a60 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:52 +0300 Subject: devlink: Introduce burst period for health reporter Currently, the devlink health reporter starts the grace period immediately after handling an error, blocking any further recoveries until it finishes. However, when a single root cause triggers multiple errors in a short time frame, it is desirable to treat them as a bulk of errors and allow their recoveries, avoiding premature blocking of subsequent related errors, and reducing the risk of inconsistent or incomplete error handling. To address this, introduce a configurable burst period for the devlink health reporter. Start this period when the first error is handled, and allow recovery attempts for reported errors during this window. Once the burst period expires, begin the grace period to block further recoveries until it concludes.
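A condensed sketch of the decision flow this framework implements, mirroring devlink_health_reporter_in_burst() and the updated devlink_health_recover_abort() in the patch below (the last_recovery_ts/recovery_count guards are omitted for brevity); the timeline that follows summarizes the same two windows:

	unsigned long burst_end = reporter->last_recovery_ts +
				  msecs_to_jiffies(reporter->burst_period);
	unsigned long grace_end = burst_end +
				  msecs_to_jiffies(reporter->graceful_period);

	if (time_is_after_jiffies(burst_end))
		return false;	/* burst window: recovery still allowed */
	if (time_is_after_jiffies(grace_end))
		return true;	/* grace period: recovery aborted */
	return false;		/* both windows elapsed: recovery allowed */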
Timeline summary:

----|--------|------------------------------/----------------------/--
  error is   error is       burst period           grace period
  reported   recovered  (recoveries allowed)  (recoveries blocked)

For calculating the burst period duration, use the same last_recovery_ts as the grace period. Update it on recovery only when the burst period is inactive (either disabled or at the first error). This patch implements the framework for the burst period and effectively sets its value to 0 at reporter creation, so the current behavior remains unchanged, which ensures backward compatibility. A downstream patch will make the burst period configurable. Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +++ net/devlink/health.c | 22 +++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index c7ad7a981b39..5f44e702c25c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -748,6 +748,8 @@ enum devlink_health_reporter_state { * @test: callback to trigger a test event * @default_graceful_period: default min time (in msec) * between recovery attempts + * @default_burst_period: default time (in msec) for + * error recoveries before starting the grace period */ struct devlink_health_reporter_ops { @@ -763,6 +765,7 @@ struct devlink_health_reporter_ops { int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); u64 default_graceful_period; + u64 default_burst_period; }; /** diff --git a/net/devlink/health.c b/net/devlink/health.c index 9d0d4a9face7..94ab77f77add 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -60,6 +60,7 @@ struct devlink_health_reporter { struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; + u64 burst_period; bool auto_recover; bool auto_dump; u8 health_state; @@ -123,6 +124,7 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->ops = ops; reporter->devlink = devlink; reporter->graceful_period = ops->default_graceful_period; + reporter->burst_period = ops->default_burst_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -508,11 +510,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, devlink_nl_notify_send_desc(devlink, msg, &desc); } +static bool +devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter) +{ + unsigned long burst_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period); + + return time_is_after_jiffies(burst_threshold); +} + void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; + if (!devlink_health_reporter_in_burst(reporter)) + /* When burst period is set, last_recovery_ts marks the first * recovery within the burst period, not necessarily the last * one.
+ */ + reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); @@ -599,7 +615,11 @@ devlink_health_recover_abort(struct devlink_health_reporter *reporter, if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) return true; + if (devlink_health_reporter_in_burst(reporter)) + return false; + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period) + msecs_to_jiffies(reporter->graceful_period); if (reporter->last_recovery_ts && reporter->recovery_count && time_is_after_jiffies(recover_ts_threshold)) -- cgit v1.2.3 From da0e2197645c8e01bb6080c7a2b86d9a56cc64a9 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:53 +0300 Subject: devlink: Make health reporter burst period configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable configuration of the burst period — a time window starting from the first error recovery, during which the reporter allows recovery attempts for each reported error. This feature is helpful when a single underlying issue causes multiple errors, as it delays the start of the grace period to allow sufficient time for recovering all related errors. For example, if multiple TX queues time out simultaneously, a sufficient burst period could allow all affected TX queues to be recovered within that window. Without this period, only the first TX queue that reports a timeout will undergo recovery, while the remaining TX queues will be blocked once the grace period begins. Configuration example: $ devlink health set pci/0000:00:09.0 reporter tx burst_period 500 Configuration example with ynl: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/devlink.yaml \ --do health-reporter-set --json '{ "bus-name": "auxiliary", "dev-name": "mlx5_core.eth.0", "port-index": 65535, "health-reporter-name": "tx", "health-reporter-burst-period": 500 }' Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Reviewed-by: Carolina Jubran Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 7 ++++++ .../networking/devlink/devlink-health.rst | 2 +- include/uapi/linux/devlink.h | 2 ++ net/devlink/health.c | 28 ++++++++++++++++++++-- net/devlink/netlink_gen.c | 5 ++-- 5 files changed, 39 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bb87111d5e16..3db59c965869 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -853,6 +853,10 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-rate-tc-bws + - + name: health-reporter-burst-period + type: u64 + doc: Time (in msec) for recoveries before starting the grace period. 
- name: dl-dev-stats subset-of: devlink @@ -1216,6 +1220,8 @@ attribute-sets: name: health-reporter-dump-ts-ns - name: health-reporter-auto-dump + - + name: health-reporter-burst-period - name: dl-attr-stats @@ -1961,6 +1967,7 @@ operations: - health-reporter-graceful-period - health-reporter-auto-recover - health-reporter-auto-dump + - health-reporter-burst-period - name: health-reporter-recover diff --git a/Documentation/networking/devlink/devlink-health.rst b/Documentation/networking/devlink/devlink-health.rst index e0b8cfed610a..4d10536377ab 100644 --- a/Documentation/networking/devlink/devlink-health.rst +++ b/Documentation/networking/devlink/devlink-health.rst @@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions: * Auto recovery attempt is being done. Depends on: - Auto-recovery configuration - - Grace period vs. time passed since last recover + - Grace period (and burst period) vs. time passed since last recover Devlink formatted message ========================= diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9fcb25a0f447..bcad11a787a5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -636,6 +636,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, /* u64 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/health.c b/net/devlink/health.c index 94ab77f77add..136a67c36a20 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -116,6 +116,9 @@ __devlink_health_reporter_create(struct devlink *devlink, if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); + if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) + return ERR_PTR(-EINVAL); + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); if (!reporter) return ERR_PTR(-ENOMEM); @@ -293,6 +296,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period)) goto reporter_nest_cancel; + if (reporter->ops->recover && + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, + reporter->burst_period)) + goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) @@ -458,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) { reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + if (!reporter->graceful_period) + reporter->burst_period = 0; + } + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) { + u64 burst_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]); + + if (!reporter->graceful_period && burst_period) { + NL_SET_ERR_MSG_MOD(info->extack, + "Cannot set burst period without a grace period."); + return 
-EINVAL; + } + + reporter->burst_period = burst_period; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index d97c326a9045..9fd00977d59e 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN }; /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */ -static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = { +static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, }, + [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, }, }; /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */ @@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_health_reporter_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_health_reporter_set_nl_policy, - .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { -- cgit v1.2.3 From 1bec9d0c0046fe4e2bfb6a1c5aadcb5d56cdb0fb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 25 Aug 2025 15:37:43 +0200 Subject: ipv4: Convert ->flowi4_tos to dscp_t. Convert the ->flowic_tos field of struct flowi_common from __u8 to dscp_t, rename it ->flowic_dscp and propagate these changes to struct flowi and struct flowi4. We've had several bugs in the past where ECN bits could interfere with IPv4 routing, because these bits were not properly cleared when setting ->flowi4_tos. These bugs should be fixed now and the dscp_t type has been introduced to ensure that variables carrying DSCP values don't accidentally have any ECN bits set. Several variables and structure fields have been converted to dscp_t already, but the main IPv4 routing structure, struct flowi4, is still using a __u8. To avoid any future regression, this patch converts it to dscp_t. There are many users to convert at once. Fortunately, around half of ->flowi4_tos users already have a dscp_t value at hand, which they currently convert to __u8 using inet_dscp_to_dsfield(). For all of these users, we just need to drop that conversion. But, although we try to do the __u8 <-> dscp_t conversions at the boundaries of the network or of user space, some places still store TOS/DSCP variables as __u8 in core networking code. Those can hardly be converted either because the data structure is part of UAPI or because the same variable or field is also used for handling ECN in other parts of the code. In all of these cases where we don't have a dscp_t variable at hand, we need to use inet_dsfield_to_dscp() when interacting with ->flowi4_dscp. Changes since v1: * Fix space alignment in __bpf_redirect_neigh_v4() (Ido). 
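The mechanical shape of the conversion, sketched from two of the hunks below (the ipvlan and amt changes):

	/* A dscp_t value is already at hand: drop the __u8 round-trip. */
	fl4.flowi4_dscp = ip4h_dscp(ip4h);
	/* was: fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); */

	/* Only a __u8 dsfield is available: convert at the boundary. */
	fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS);
	/* was: fl4.flowi4_tos = AMT_TOS; */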
Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/29acecb45e911d17446b9a3dbdb1ab7b821ea371.1756128932.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 7 ++++--- drivers/net/ethernet/sfc/tc_encap_actions.c | 4 +++- drivers/net/gtp.c | 7 ++++--- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- drivers/net/vrf.c | 4 ++-- include/net/flow.h | 11 ++++++----- include/net/inet_dscp.h | 6 ++++++ include/net/ip_fib.h | 2 +- include/net/ip_tunnels.h | 4 +++- include/net/route.h | 2 +- include/trace/events/fib.h | 4 +++- net/core/filter.c | 4 ++-- net/core/lwt_bpf.c | 4 ++-- net/ipv4/fib_frontend.c | 7 ++++--- net/ipv4/fib_rules.c | 4 ++-- net/ipv4/icmp.c | 5 +++-- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_output.c | 3 ++- net/ipv4/ipmr.c | 3 ++- net/ipv4/netfilter.c | 4 ++-- net/ipv4/netfilter/ipt_rpfilter.c | 4 ++-- net/ipv4/netfilter/nf_dup_ipv4.c | 4 ++-- net/ipv4/netfilter/nft_fib_ipv4.c | 4 ++-- net/ipv4/route.c | 8 ++++---- net/ipv4/udp_tunnel_core.c | 3 ++- net/ipv4/xfrm4_policy.c | 4 ++-- net/netfilter/nft_flow_offload.c | 4 ++-- net/sctp/protocol.c | 3 ++- net/xfrm/xfrm_policy.c | 6 +++--- 30 files changed, 81 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/drivers/net/amt.c b/drivers/net/amt.c index ed86537b2f61..902c817a0dea 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include static struct workqueue_struct *amt_wq; @@ -1018,7 +1020,7 @@ static bool amt_send_membership_update(struct amt_dev *amt, fl4.flowi4_oif = amt->stream_dev->ifindex; fl4.daddr = amt->remote_ip; fl4.saddr = amt->local_ip; - fl4.flowi4_tos = AMT_TOS; + fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS); fl4.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(amt->net, &fl4); if (IS_ERR(rt)) { @@ -1133,7 +1135,7 @@ static bool amt_send_membership_query(struct amt_dev *amt, fl4.flowi4_oif = amt->stream_dev->ifindex; fl4.daddr = tunnel->ip4; fl4.saddr = amt->local_ip; - fl4.flowi4_tos = AMT_TOS; + fl4.flowi4_dscp = inet_dsfield_to_dscp(AMT_TOS); fl4.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(amt->net, &fl4); if (IS_ERR(rt)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 2162d776fe35..a14f216048cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -1,7 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* Copyright (c) 2018 Mellanox Technologies. 
*/ -#include +#include +#include #include #include #include @@ -233,7 +234,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.flowi4_dscp = inet_dsfield_to_dscp(tun_key->tos); attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; @@ -349,7 +350,7 @@ int mlx5e_tc_tun_update_header_ipv4(struct mlx5e_priv *priv, int err; /* add the IP fields */ - attr.fl.fl4.flowi4_tos = tun_key->tos & ~INET_ECN_MASK; + attr.fl.fl4.flowi4_dscp = inet_dsfield_to_dscp(tun_key->tos); attr.fl.fl4.daddr = tun_key->u.ipv4.dst; attr.fl.fl4.saddr = tun_key->u.ipv4.src; attr.ttl = tun_key->ttl; diff --git a/drivers/net/ethernet/sfc/tc_encap_actions.c b/drivers/net/ethernet/sfc/tc_encap_actions.c index e872f926e438..eef06e48185d 100644 --- a/drivers/net/ethernet/sfc/tc_encap_actions.c +++ b/drivers/net/ethernet/sfc/tc_encap_actions.c @@ -11,6 +11,8 @@ #include "tc_encap_actions.h" #include "tc.h" #include "mae.h" +#include +#include #include #include #include @@ -99,7 +101,7 @@ static int efx_bind_neigh(struct efx_nic *efx, case EFX_ENCAP_TYPE_GENEVE: flow4.flowi4_proto = IPPROTO_UDP; flow4.fl4_dport = encap->key.tp_dst; - flow4.flowi4_tos = encap->key.tos; + flow4.flowi4_dscp = inet_dsfield_to_dscp(encap->key.tos); flow4.daddr = encap->key.u.ipv4.dst; flow4.saddr = encap->key.u.ipv4.src; break; diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 4b668ebaa0f7..5cb59d72bc82 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -21,9 +21,10 @@ #include #include +#include +#include #include #include -#include #include #include #include @@ -352,7 +353,7 @@ static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4, fl4->flowi4_oif = sk->sk_bound_dev_if; fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))); + fl4->flowi4_dscp = inet_sk_dscp(inet_sk(sk)); fl4->flowi4_scope = ip_sock_rt_scope(sk); fl4->flowi4_proto = sk->sk_protocol; @@ -2401,7 +2402,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) udp_tunnel_xmit_skb(rt, sk, skb_to_send, fl4.saddr, fl4.daddr, - fl4.flowi4_tos, + inet_dscp_to_dsfield(fl4.flowi4_dscp), ip4_dst_hoplimit(&rt->dst), 0, port, port, diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index e3e65772c599..d7e3ddbcab6f 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -2,7 +2,7 @@ /* Copyright (c) 2014 Mahesh Bandewar */ -#include +#include #include #include "ipvlan.h" @@ -433,7 +433,7 @@ static noinline_for_stack int ipvlan_process_v4_outbound(struct sk_buff *skb) ip4h = ip_hdr(skb); fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + fl4.flowi4_dscp = ip4h_dscp(ip4h); rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 3ccd649913b5..571847a7f86d 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -38,7 +39,6 @@ #include #include #include -#include #define DRV_NAME "vrf" #define DRV_VERSION "1.1" @@ -505,7 +505,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)); + fl4.flowi4_dscp = 
ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; diff --git a/include/net/flow.h b/include/net/flow.h index a1839c278d87..ae9481c40063 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -12,6 +12,7 @@ #include #include #include +#include struct flow_keys; @@ -32,7 +33,7 @@ struct flowi_common { int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; - __u8 flowic_tos; + dscp_t flowic_dscp; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; @@ -70,7 +71,7 @@ struct flowi4 { #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark -#define flowi4_tos __fl_common.flowic_tos +#define flowi4_dscp __fl_common.flowic_dscp #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags @@ -103,7 +104,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; @@ -141,7 +142,7 @@ struct flowi6 { #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; - /* Note: flowi6_tos is encoded in flowlabel, too. */ + /* Note: flowi6_dscp is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport @@ -163,7 +164,7 @@ struct flowi { #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark -#define flowi_tos u.__fl_common.flowic_tos +#define flowi_dscp u.__fl_common.flowic_dscp #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h index 72f250dffada..1aa9f04ed1ab 100644 --- a/include/net/inet_dscp.h +++ b/include/net/inet_dscp.h @@ -39,6 +39,12 @@ typedef u8 __bitwise dscp_t; #define INET_DSCP_MASK 0xfc +/* A few places in the IPv4 code need to ignore the three high order bits of + * DSCP because of backward compatibility (as these bits used to represent the + * IPv4 Precedence in RFC 791's TOS field and were ignored). 
+ */ +#define INET_DSCP_LEGACY_TOS_MASK ((__force dscp_t)0x1c) + static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) { return (__force dscp_t)(dsfield & INET_DSCP_MASK); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 48bb3cf41469..b4495c38e0a0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -440,7 +440,7 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { - return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos)); + return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); } /* Exported by fib_frontend.c */ diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 8cf1380f3656..4314a97702ea 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -11,7 +11,9 @@ #include #include +#include #include +#include #include #include #include @@ -362,7 +364,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; diff --git a/include/net/route.h b/include/net/route.h index 7ea840daa775..c71998f464f8 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -189,7 +189,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = inet_dscp_to_dsfield(dscp), + .flowi4_dscp = dscp, .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 20b914250ce9..feb28b359eff 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -7,6 +7,8 @@ #include #include +#include +#include #include #include @@ -44,7 +46,7 @@ TRACE_EVENT(fib_table_lookup, __entry->err = err; __entry->oif = flp->flowi4_oif; __entry->iif = flp->flowi4_iif; - __entry->tos = flp->flowi4_tos; + __entry->tos = inet_dscp_to_dsfield(flp->flowi4_dscp); __entry->scope = flp->flowi4_scope; __entry->flags = flp->flowi4_flags; diff --git a/net/core/filter.c b/net/core/filter.c index 5da1cad66be2..b005363f482c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2373,7 +2373,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), + .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -6020,7 +6020,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index ae74634310a3..9f40be0c3e71 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -8,12 +8,12 @@ #include #include #include +#include #include #include #include #include #include -#include struct bpf_lwt_prog { struct bpf_prog *prog; @@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; 
fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6e1b94796f67..1dab44e13d3b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), + .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; @@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; @@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, + .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index fa58d6620ed6..51f0193092f0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && - (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) + if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 91765057aa1d..7248c15cbd75 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); + fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); @@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f5b9004d6938..761a53c6a89a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -44,7 +45,6 @@ #include #include #include -#include /* Problems & solutions @@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)), + .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = 
IPPROTO_GRE, .saddr = t->parms.iph.saddr, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 84e7f8a2f50f..2b96651d719b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -63,6 +63,7 @@ #include #include +#include #include #include #include @@ -485,7 +486,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e86a8a862c41..345e5faac634 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e60e54e7945d..ce310eb779e0 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index a27782d7653e..6d9bf5106868 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; - flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + flow.flowi4_dscp = ip4h_dscp(iph); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ed08fb78cfa8..9a773502f10a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -12,10 +12,10 @@ #include #include #include +#include #include #include #include -#include #include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include @@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7e7c49535e3f..82af6cd76d13 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include @@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1f212b2ce4c6..771f6986ed05 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -84,6 +84,7 @@ #include #include #include +#include #include #include #include @@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2690,7 +2691,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3333,7 +3333,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index fce945f23069..54386e06a813 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; - fl4.flowi4_tos = tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = 
ip_route_output_key(net, &fl4); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7fb6205619e7..58faf1ddd2b1 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); + fl4->flowi4_dscp = params->dscp; fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 225ff293cd50..14dd1c0698c3 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 3b2373b3bd5d..9dbc24af749b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7111184eef59..62486f866975 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) - return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); + return fl->u.ip4.flowi4_dscp; return 0; } @@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve } fl4->flowi4_proto = flkeys->basic.ip_proto; - fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); } #if IS_ENABLED(CONFIG_IPV6) @@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; - fl1->flowi_tos = fl->flowi_tos; + fl1->flowi_dscp = fl->flowi_dscp; nf_nat_decode_session(newskb, fl1, family); ret = false; -- cgit v1.2.3 From 095928e7d80186c524013a5b5d54889fa2ec1eaa Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Aug 2025 21:36:43 -0400 Subject: ipv6: sr: Use HMAC-SHA1 and HMAC-SHA256 library functions Use the HMAC-SHA1 and HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. 
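For context, the library interface used here is the one-shot keyed digest from lib/crypto: no per-CPU tfm, no descriptor, no setkey step. A minimal sketch of the SHA-256 variant as called in the new seg6_hmac_compute() (the header name and buffer names are my gloss on the diff, not authoritative):

	#include <crypto/sha2.h>

	u8 out[SHA256_DIGEST_SIZE];

	/* One call replaces the crypto_alloc_shash()/setkey()/digest()
	 * sequence of the shash API:
	 */
	hmac_sha256_usingrawkey(secret, secret_len, data, data_len, out);

The SHA-1 variant, hmac_sha1_usingrawkey(), is symmetrical.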
Pre-allocating per-CPU hash transformation objects and descriptors is no longer needed, and a microbenchmark on x86_64 shows seg6_hmac_compute() (with HMAC-SHA256) dropping from ~2494 cycles to ~1978 cycles, a 20% improvement. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250824013644.71928-2-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/seg6_hmac.h | 12 --- net/ipv6/Kconfig | 7 +- net/ipv6/seg6.c | 7 -- net/ipv6/seg6_hmac.c | 207 +++++++----------------------------------------- 4 files changed, 30 insertions(+), 203 deletions(-) (limited to 'net') diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 24f733b3e3fe..3fe4123dbbf0 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -19,7 +19,6 @@ #include #include -#define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 struct seg6_hmac_info { @@ -32,13 +31,6 @@ struct seg6_hmac_info { u8 alg_id; }; -struct seg6_hmac_algo { - u8 alg_id; - char name[64]; - struct crypto_shash * __percpu *tfms; - struct shash_desc * __percpu *shashs; -}; - extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output); @@ -50,13 +42,9 @@ extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh); extern bool seg6_hmac_validate_skb(struct sk_buff *skb); #ifdef CONFIG_IPV6_SEG6_HMAC -extern int seg6_hmac_init(void); -extern void seg6_hmac_exit(void); extern int seg6_hmac_net_init(struct net *net); extern void seg6_hmac_net_exit(struct net *net); #else -static inline int seg6_hmac_init(void) { return 0; } -static inline void seg6_hmac_exit(void) {} static inline int seg6_hmac_net_init(struct net *net) { return 0; } static inline void seg6_hmac_net_exit(struct net *net) {} #endif diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 1c9c686d9522..b8f9a8c0302e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS help Support for HMAC signature generation and verification of SR-enabled packets. 
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 180da19c148c..a5c4c629b788 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -522,16 +522,10 @@ int __init seg6_init(void) if (err) goto out_unregister_iptun; - err = seg6_hmac_init(); - if (err) - goto out_unregister_seg6; - pr_info("Segment Routing with IPv6\n"); out: return err; -out_unregister_seg6: - seg6_local_exit(); out_unregister_iptun: seg6_iptunnel_exit(); out_unregister_genl: @@ -543,7 +537,6 @@ out_unregister_pernet: void seg6_exit(void) { - seg6_hmac_exit(); seg6_local_exit(); seg6_iptunnel_exit(); genl_unregister_family(&seg6_genl_family); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index fd58426f222b..61f6019df55b 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,8 @@ #include #include -#include +#include +#include #include #include #include @@ -78,17 +78,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = seg6_hmac_cmpfn, }; -static struct seg6_hmac_algo hmac_algos[] = { - { - .alg_id = SEG6_HMAC_ALGO_SHA1, - .name = "hmac(sha1)", - }, - { - .alg_id = SEG6_HMAC_ALGO_SHA256, - .name = "hmac(sha256)", - }, -}; - static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; @@ -108,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) return tlv; } -static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) -{ - struct seg6_hmac_algo *algo; - int i, alg_count; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - if (algo->alg_id == alg_id) - return algo; - } - - return NULL; -} - -static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, - u8 *output, int outlen) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret, dgsize; - - algo = __hmac_get_algo(hinfo->alg_id); - if (!algo) - return -ENOENT; - - tfm = *this_cpu_ptr(algo->tfms); - - dgsize = crypto_shash_digestsize(tfm); - if (dgsize > outlen) { - pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", - dgsize, outlen); - return -ENOMEM; - } - - ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); - goto failed; - } - - shash = *this_cpu_ptr(algo->shashs); - shash->tfm = tfm; - - ret = crypto_shash_digest(shash, text, psize, output); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); - goto failed; - } - - return dgsize; - -failed: - return ret; -} - int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); - u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; - int plen, i, dgsize, wrsize; + int plen, i, ret = 0; char *ring, *off; - /* a 160-byte buffer for digest output allows to store highest known - * hash function (RadioGatun) with up to 1216 bits - */ - /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; @@ -219,22 +146,26 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, off += 16; } - dgsize = __do_hmac(hinfo, ring, plen, tmp_out, - SEG6_HMAC_MAX_DIGESTSIZE); + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, + output); + static_assert(SEG6_HMAC_FIELD_LEN > 
SHA1_DIGEST_SIZE); + memset(&output[SHA1_DIGEST_SIZE], 0, + SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, + output); + static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); + break; + default: + ret = -ENOENT; + break; + } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); - - if (dgsize < 0) - return dgsize; - - wrsize = SEG6_HMAC_FIELD_LEN; - if (wrsize > dgsize) - wrsize = dgsize; - - memset(output, 0, SEG6_HMAC_FIELD_LEN); - memcpy(output, tmp_out, wrsize); - - return 0; + return ret; } EXPORT_SYMBOL(seg6_hmac_compute); @@ -305,8 +236,13 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; - if (!__hmac_get_algo(hinfo->alg_id)) + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + case SEG6_HMAC_ALGO_SHA256: + break; + default: return -EINVAL; + } err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); @@ -363,65 +299,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_algo(void) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - int ret = -ENOMEM; - - alg_count = ARRAY_SIZE(hmac_algos); - - for (i = 0; i < alg_count; i++) { - struct crypto_shash **p_tfm; - int shsize; - - algo = &hmac_algos[i]; - algo->tfms = alloc_percpu(struct crypto_shash *); - if (!algo->tfms) - goto error_out; - - for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto error_out; - } - p_tfm = per_cpu_ptr(algo->tfms, cpu); - *p_tfm = tfm; - } - - p_tfm = raw_cpu_ptr(algo->tfms); - tfm = *p_tfm; - - shsize = sizeof(*shash) + crypto_shash_descsize(tfm); - - algo->shashs = alloc_percpu(struct shash_desc *); - if (!algo->shashs) - goto error_out; - - for_each_possible_cpu(cpu) { - shash = kzalloc_node(shsize, GFP_KERNEL, - cpu_to_node(cpu)); - if (!shash) - goto error_out; - *per_cpu_ptr(algo->shashs, cpu) = shash; - } - } - - return 0; - -error_out: - seg6_hmac_exit(); - return ret; -} - -int __init seg6_hmac_init(void) -{ - return seg6_hmac_init_algo(); -} - int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); @@ -429,36 +306,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -void seg6_hmac_exit(void) -{ - struct seg6_hmac_algo *algo = NULL; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - - if (algo->shashs) { - for_each_possible_cpu(cpu) { - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - } - free_percpu(algo->shashs); - } - - if (algo->tfms) { - for_each_possible_cpu(cpu) { - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); - } - free_percpu(algo->tfms); - } - } -} -EXPORT_SYMBOL(seg6_hmac_exit); - void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); -- cgit v1.2.3 From fe60065689048edf4df99fffdb180a2166f9a54d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 23 Aug 2025 21:36:44 -0400 Subject: ipv6: sr: Prepare HMAC key ahead of time Prepare the HMAC key when it is added to the kernel, instead of preparing it implicitly for every packet. 
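In other words, the key schedule moves from the per-packet path to the control path. Schematically, with the same API names as in the diff below:

	struct hmac_sha256_key key;

	/* Control path, once when the key is added: */
	hmac_sha256_preparekey(&key, secret, secret_len);

	/* Data path, per packet; the raw secret is no longer touched: */
	hmac_sha256(&key, data, data_len, out);

(hmac_sha1_preparekey()/hmac_sha1() mirror this for SEG6_HMAC_ALGO_SHA1.) The raw secret is kept in the structure only so it can be echoed back to userspace.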
This significantly improves the performance of seg6_hmac_compute(). A microbenchmark on x86_64 shows seg6_hmac_compute() (with HMAC-SHA256) dropping from ~1978 cycles to ~1419 cycles, a 28% improvement. The size of 'struct seg6_hmac_info' increases by 128 bytes, but that should be fine, since there should not be a massive number of keys. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250824013644.71928-3-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/seg6_hmac.h | 8 ++++++++ net/ipv6/seg6_hmac.c | 14 +++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 3fe4123dbbf0..e9f41725933e 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -9,6 +9,8 @@ #ifndef _NET_SEG6_HMAC_H #define _NET_SEG6_HMAC_H +#include +#include #include #include #include @@ -26,9 +28,15 @@ struct seg6_hmac_info { struct rcu_head rcu; u32 hmackeyid; + /* The raw key, kept only so it can be returned back to userspace */ char secret[SEG6_HMAC_SECRET_LEN]; u8 slen; u8 alg_id; + /* The prepared key, which the calculations actually use */ + union { + struct hmac_sha1_key sha1; + struct hmac_sha256_key sha256; + } key; }; extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 61f6019df55b..ee6bac0160ac 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -148,19 +148,18 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: - hmac_sha1_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, - output); + hmac_sha1(&hinfo->key.sha1, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); memset(&output[SHA1_DIGEST_SIZE], 0, SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); break; case SEG6_HMAC_ALGO_SHA256: - hmac_sha256_usingrawkey(hinfo->secret, hinfo->slen, ring, plen, - output); + hmac_sha256(&hinfo->key.sha256, ring, plen, output); static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); break; default: - ret = -ENOENT; + WARN_ON_ONCE(1); + ret = -EINVAL; break; } local_unlock_nested_bh(&hmac_storage.bh_lock); @@ -238,7 +237,12 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) switch (hinfo->alg_id) { case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_preparekey(&hinfo->key.sha1, + hinfo->secret, hinfo->slen); + break; case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_preparekey(&hinfo->key.sha256, + hinfo->secret, hinfo->slen); break; default: return -EINVAL; -- cgit v1.2.3 From 479a54ab92087318514c82428a87af2d7af1a576 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 22 Aug 2025 11:52:19 +0800 Subject: netfilter: br_netfilter: do not check confirmed bit in br_nf_local_in() after confirm When send a broadcast packet to a tap device, which was added to a bridge, br_nf_local_in() is called to confirm the conntrack. If another conntrack with the same hash value is added to the hash table, which can be triggered by a normal packet to a non-bridge device, the below warning may happen. 
------------[ cut here ]------------ WARNING: CPU: 1 PID: 96 at net/bridge/br_netfilter_hooks.c:632 br_nf_local_in+0x168/0x200 CPU: 1 UID: 0 PID: 96 Comm: tap_send Not tainted 6.17.0-rc2-dirty #44 PREEMPT(voluntary) RIP: 0010:br_nf_local_in+0x168/0x200 Call Trace: nf_hook_slow+0x3e/0xf0 br_pass_frame_up+0x103/0x180 br_handle_frame_finish+0x2de/0x5b0 br_nf_hook_thresh+0xc0/0x120 br_nf_pre_routing_finish+0x168/0x3a0 br_nf_pre_routing+0x237/0x5e0 br_handle_frame+0x1ec/0x3c0 __netif_receive_skb_core+0x225/0x1210 __netif_receive_skb_one_core+0x37/0xa0 netif_receive_skb+0x36/0x160 tun_get_user+0xa54/0x10c0 tun_chr_write_iter+0x65/0xb0 vfs_write+0x305/0x410 ksys_write+0x60/0xd0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f ---[ end trace 0000000000000000 ]--- To solve the hash conflict, nf_ct_resolve_clash() try to merge the conntracks, and update skb->_nfct. However, br_nf_local_in() still use the old ct from local variable 'nfct' after confirm(), which leads to this warning. If confirm() does not insert the conntrack entry and return NF_DROP, the warning may also occur. There is no need to reserve the WARN_ON_ONCE, just remove it. Link: https://lore.kernel.org/netdev/20250820043329.2902014-1-wangliang74@huawei.com/ Fixes: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack") Suggested-by: Florian Westphal Signed-off-by: Wang Liang Signed-off-by: Florian Westphal --- net/bridge/br_netfilter_hooks.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 94cbe967d1c1..083e2fe96441 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -626,9 +626,6 @@ static unsigned int br_nf_local_in(void *priv, break; } - ct = container_of(nfct, struct nf_conn, ct_general); - WARN_ON_ONCE(!nf_ct_is_confirmed(ct)); - return ret; } #endif -- cgit v1.2.3 From 54416fd76770bd04fc3c501810e8d673550bab26 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Mon, 18 Aug 2025 13:22:20 +0200 Subject: netfilter: conntrack: helper: Replace -EEXIST by -EBUSY The helper registration return value is passed-through by module_init callbacks which modprobe confuses with the harmless -EEXIST returned when trying to load an already loaded module. Make sure modprobe fails so users notice their helper has not been registered and won't work. 
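The practical effect shows up at module load time: modprobe treats -EEXIST from an init function as "module already loaded" and exits successfully, so the registration failure was invisible. A hypothetical helper module sketches the path (my_helper_init/my_helper are placeholders, not from this patch):

	static int __init my_helper_init(void)
	{
		/* A tuple clash used to return -EEXIST here, which modprobe
		 * swallowed as success; -EBUSY now makes the load fail
		 * loudly.
		 */
		return nf_conntrack_helper_register(&my_helper);
	}
	module_init(my_helper_init);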
Suggested-by: Christophe Leroy Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_helper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4ed5878cb25b..ceb48c3ca0a4 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -368,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } @@ -379,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } -- cgit v1.2.3 From dcb34659028f856c423a29ef9b4e2571d203444d Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:55 +0900 Subject: net: rose: split remove and free operations in rose_remove_neigh() The current rose_remove_neigh() performs two distinct operations: 1. Removes rose_neigh from rose_neigh_list 2. Frees the rose_neigh structure Split these operations into separate functions to improve maintainability and prepare for upcoming refcount_t conversion. The timer cleanup remains in rose_remove_neigh() because free operations can be called from timer itself. This patch introduce rose_neigh_put() to handle the freeing of rose_neigh structures and modify rose_remove_neigh() to handle removal only. Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-2-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- include/net/rose.h | 8 ++++++++ net/rose/rose_route.c | 15 ++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/rose.h b/include/net/rose.h index 23267b4efcfa..174b4f605d84 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -151,6 +151,14 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_put(struct rose_neigh *rose_neigh) +{ + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); +} + /* af_rose.c */ extern ax25_address rose_callsign; extern int sysctl_rose_restart_request_timeout; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index b72bf8a08d48..0c44c416f485 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -234,20 +234,12 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_neigh) { s->next = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } @@ -331,8 +323,10 @@ static int rose_del_node(struct rose_route_struct *rose_route, if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; - if (rose_neigh->count == 0 && rose_neigh->use == 0) + if (rose_neigh->count == 0 && rose_neigh->use == 0) { rose_remove_neigh(rose_neigh); + 
rose_neigh_put(rose_neigh); + } rose_node->count--; @@ -513,6 +507,7 @@ void rose_rt_device_down(struct net_device *dev) } rose_remove_neigh(s); + rose_neigh_put(s); } spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); @@ -569,6 +564,7 @@ static int rose_clear_routes(void) if (s->use == 0 && !s->loopback) { s->count = 0; rose_remove_neigh(s); + rose_neigh_put(s); } } @@ -1301,6 +1297,7 @@ void __exit rose_rt_free(void) rose_neigh = rose_neigh->next; rose_remove_neigh(s); + rose_neigh_put(s); } while (rose_node != NULL) { -- cgit v1.2.3 From d860d1faa6b2ce3becfdb8b0c2b048ad31800061 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:56 +0900 Subject: net: rose: convert 'use' field to refcount_t The 'use' field in struct rose_neigh is used as a reference counter but lacks atomicity. This can lead to race conditions where a rose_neigh structure is freed while still being referenced by other code paths. For example, when rose_neigh->use becomes zero during an ioctl operation via rose_rt_ioctl(), the structure may be removed while its timer is still active, potentially causing use-after-free issues. This patch changes the type of 'use' from unsigned short to refcount_t and updates all code paths to use rose_neigh_hold() and rose_neigh_put() which operate reference counts atomically. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-3-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- include/net/rose.h | 18 +++++++++++++----- net/rose/af_rose.c | 13 +++++++------ net/rose/rose_in.c | 12 ++++++------ net/rose/rose_route.c | 33 ++++++++++++++++++--------------- net/rose/rose_timer.c | 2 +- 5 files changed, 45 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/rose.h b/include/net/rose.h index 174b4f605d84..2b5491bbf39a 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -8,6 +8,7 @@ #ifndef _ROSE_H #define _ROSE_H +#include #include #include #include @@ -96,7 +97,7 @@ struct rose_neigh { ax25_cb *ax25; struct net_device *dev; unsigned short count; - unsigned short use; + refcount_t use; unsigned int number; char restarted; char dce_mode; @@ -151,12 +152,19 @@ struct rose_sock { #define rose_sk(sk) ((struct rose_sock *)(sk)) +static inline void rose_neigh_hold(struct rose_neigh *rose_neigh) +{ + refcount_inc(&rose_neigh->use); +} + static inline void rose_neigh_put(struct rose_neigh *rose_neigh) { - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); + if (refcount_dec_and_test(&rose_neigh->use)) { + if (rose_neigh->ax25) + ax25_cb_put(rose_neigh->ax25); + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + } } /* af_rose.c */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 4e72b636a46a..543f9e8ebb69 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -170,7 +170,7 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) if (rose->neighbour == neigh) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; } } @@ -212,7 +212,7 @@ start: if (rose->device == dev) { rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } @@ -655,7 +655,7 @@ static int rose_release(struct socket *sock) break; case ROSE_STATE_2: - 
rose->neighbour->use--; + rose_neigh_put(rose->neighbour); release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); @@ -823,6 +823,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->lci = rose_new_lci(rose->neighbour); if (!rose->lci) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } @@ -834,12 +835,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le dev = rose_dev_first(); if (!dev) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; + rose_neigh_put(rose->neighbour); dev_put(dev); goto out_release; } @@ -874,8 +877,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->state = ROSE_STATE_1; - rose->neighbour->use++; - rose_write_internal(sk, ROSE_CALL_REQUEST); rose_start_heartbeat(sk); rose_start_t1timer(sk); @@ -1077,7 +1078,7 @@ int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros GFP_ATOMIC); make_rose->facilities = facilities; - make_rose->neighbour->use++; + rose_neigh_hold(make_rose->neighbour); if (rose_sk(sk)->defer) { make_rose->state = ROSE_STATE_5; diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 3e99181e759f..0276b393f0e5 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -56,7 +56,7 @@ static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -79,12 +79,12 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_CLEAR_CONFIRMATION: rose_disconnect(sk, 0, -1, -1); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -121,7 +121,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_RR: @@ -234,7 +234,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -254,7 +254,7 @@ static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int framety if (frametype == ROSE_CLEAR_REQUEST) { rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose_sk(sk)->neighbour->use--; + rose_neigh_put(rose_sk(sk)->neighbour); } return 0; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 0c44c416f485..8efb9033c057 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -93,11 +93,11 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_neigh->ax25 = NULL; rose_neigh->dev = dev; rose_neigh->count = 0; - rose_neigh->use = 0; rose_neigh->dce_mode = 0; rose_neigh->loopback = 0; rose_neigh->number = rose_neigh_no++; rose_neigh->restarted = 0; + 
refcount_set(&rose_neigh->use, 1); skb_queue_head_init(&rose_neigh->queue); @@ -255,10 +255,10 @@ static void rose_remove_route(struct rose_route *rose_route) struct rose_route *s; if (rose_route->neigh1 != NULL) - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); if (rose_route->neigh2 != NULL) - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); if ((s = rose_route_list) == rose_route) { rose_route_list = rose_route->next; @@ -323,7 +323,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; - if (rose_neigh->count == 0 && rose_neigh->use == 0) { + if (rose_neigh->count == 0) { rose_remove_neigh(rose_neigh); rose_neigh_put(rose_neigh); } @@ -375,11 +375,11 @@ void rose_add_loopback_neigh(void) sn->ax25 = NULL; sn->dev = NULL; sn->count = 0; - sn->use = 0; sn->dce_mode = 1; sn->loopback = 1; sn->number = rose_neigh_no++; sn->restarted = 1; + refcount_set(&sn->use, 1); skb_queue_head_init(&sn->queue); @@ -561,8 +561,7 @@ static int rose_clear_routes(void) s = rose_neigh; rose_neigh = rose_neigh->next; - if (s->use == 0 && !s->loopback) { - s->count = 0; + if (!s->loopback) { rose_remove_neigh(s); rose_neigh_put(s); } @@ -680,6 +679,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (node->neighbour[i]->restarted) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } } @@ -691,6 +691,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } failed = 1; @@ -780,13 +781,13 @@ static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) } if (rose_route->neigh1 == rose_neigh) { - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); rose_route->neigh1 = NULL; rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); } if (rose_route->neigh2 == rose_neigh) { - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); rose_route->neigh2 = NULL; rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0); } @@ -915,7 +916,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_clear_queues(sk); rose->cause = ROSE_NETWORK_CONGESTION; rose->diagnostic = 0; - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; rose->lci = 0; rose->state = ROSE_STATE_0; @@ -1040,12 +1041,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if ((new_lci = rose_new_lci(new_neigh)) == 0) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); - goto out; + goto put_neigh; } if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); - goto out; + goto put_neigh; } rose_route->lci1 = lci; @@ -1058,8 +1059,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_route->lci2 = new_lci; rose_route->neigh2 = new_neigh; - rose_route->neigh1->use++; - rose_route->neigh2->use++; + rose_neigh_hold(rose_route->neigh1); + rose_neigh_hold(rose_route->neigh2); rose_route->next = rose_route_list; rose_route_list = rose_route; @@ -1071,6 +1072,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_transmit_link(skb, rose_route->neigh2); res = 1; +put_neigh: + rose_neigh_put(new_neigh); out: 
spin_unlock_bh(&rose_route_list_lock); spin_unlock_bh(&rose_neigh_list_lock); @@ -1186,7 +1189,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v) (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, - rose_neigh->use, + refcount_read(&rose_neigh->use) - 1, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? "yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 020369c49587..bb60a1654d61 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -180,7 +180,7 @@ static void rose_timer_expiry(struct timer_list *t) break; case ROSE_STATE_2: /* T3 */ - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose_disconnect(sk, ETIMEDOUT, -1, -1); break; -- cgit v1.2.3 From da9c9c877597170b929a6121a68dcd3dd9a80f45 Mon Sep 17 00:00:00 2001 From: Takamitsu Iwai Date: Sat, 23 Aug 2025 17:58:57 +0900 Subject: net: rose: include node references in rose_neigh refcount Current implementation maintains two separate reference counting mechanisms: the 'count' field in struct rose_neigh tracks references from rose_node structures, while the 'use' field (now refcount_t) tracks references from rose_sock. This patch merges these two reference counting systems using 'use' field for proper reference management. Specifically, this patch adds incrementing and decrementing of rose_neigh->use when rose_neigh->count is incremented or decremented. This patch also modifies rose_rt_free(), rose_rt_device_down() and rose_clear_route() to properly release references to rose_neigh objects before freeing a rose_node through rose_remove_node(). These changes ensure rose_neigh structures are properly freed only when all references, including those from rose_node structures, are released. As a result, this resolves a slab-use-after-free issue reported by Syzbot. 
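Concretely, every pointer parked in rose_node->neighbour[] now pins the neighbour via its refcount, and the teardown paths drop those references before the node itself is removed. Condensed from the hunks below (bounds and error handling elided):

	/* linking a neighbour into a node: */
	rose_node->neighbour[rose_node->count++] = rose_neigh;
	rose_neigh->count++;
	rose_neigh_hold(rose_neigh);	/* the node's reference */

	/* tearing a node down: */
	for (i = 0; i < t->count; i++)
		rose_neigh_put(t->neighbour[i]);
	rose_remove_node(t);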
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+942297eecf7d2d61d1f1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=942297eecf7d2d61d1f1 Signed-off-by: Takamitsu Iwai Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250823085857.47674-4-takamitz@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/rose/rose_route.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 8efb9033c057..1adee1fbc2ed 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -178,6 +178,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, } } rose_neigh->count++; + rose_neigh_hold(rose_neigh); goto out; } @@ -187,6 +188,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_node->neighbour[rose_node->count] = rose_neigh; rose_node->count++; rose_neigh->count++; + rose_neigh_hold(rose_neigh); } out: @@ -322,6 +324,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, for (i = 0; i < rose_node->count; i++) { if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; + rose_neigh_put(rose_neigh); if (rose_neigh->count == 0) { rose_remove_neigh(rose_neigh); @@ -430,6 +433,7 @@ int rose_add_loopback_node(const rose_address *address) rose_node_list = rose_node; rose_loopback_neigh->count++; + rose_neigh_hold(rose_loopback_neigh); out: spin_unlock_bh(&rose_node_list_lock); @@ -461,6 +465,7 @@ void rose_del_loopback_node(const rose_address *address) rose_remove_node(rose_node); rose_loopback_neigh->count--; + rose_neigh_put(rose_loopback_neigh); out: spin_unlock_bh(&rose_node_list_lock); @@ -500,6 +505,7 @@ void rose_rt_device_down(struct net_device *dev) memmove(&t->neighbour[i], &t->neighbour[i + 1], sizeof(t->neighbour[0]) * (t->count - i)); + rose_neigh_put(s); } if (t->count <= 0) @@ -543,6 +549,7 @@ static int rose_clear_routes(void) { struct rose_neigh *s, *rose_neigh; struct rose_node *t, *rose_node; + int i; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); @@ -553,8 +560,12 @@ static int rose_clear_routes(void) while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; - if (!t->loopback) + + if (!t->loopback) { + for (i = 0; i < rose_node->count; i++) + rose_neigh_put(t->neighbour[i]); rose_remove_node(t); + } } while (rose_neigh != NULL) { @@ -1189,7 +1200,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v) (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, - refcount_read(&rose_neigh->use) - 1, + refcount_read(&rose_neigh->use) - rose_neigh->count - 1, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? 
"yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, @@ -1294,6 +1305,7 @@ void __exit rose_rt_free(void) struct rose_neigh *s, *rose_neigh = rose_neigh_list; struct rose_node *t, *rose_node = rose_node_list; struct rose_route *u, *rose_route = rose_route_list; + int i; while (rose_neigh != NULL) { s = rose_neigh; @@ -1307,6 +1319,8 @@ void __exit rose_rt_free(void) t = rose_node; rose_node = rose_node->next; + for (i = 0; i < t->count; i++) + rose_neigh_put(t->neighbour[i]); rose_remove_node(t); } -- cgit v1.2.3 From 2e8750469242cad8f01f320131fd5a6f540dbb99 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 14:13:14 +0000 Subject: sctp: initialize more fields in sctp_v6_from_sk() syzbot found that sin6_scope_id was not properly initialized, leading to undefined behavior. Clear sin6_scope_id and sin6_flowinfo. BUG: KMSAN: uninit-value in __sctp_v6_cmp_addr+0x887/0x8c0 net/sctp/ipv6.c:649 __sctp_v6_cmp_addr+0x887/0x8c0 net/sctp/ipv6.c:649 sctp_inet6_cmp_addr+0x4f2/0x510 net/sctp/ipv6.c:983 sctp_bind_addr_conflict+0x22a/0x3b0 net/sctp/bind_addr.c:390 sctp_get_port_local+0x21eb/0x2440 net/sctp/socket.c:8452 sctp_get_port net/sctp/socket.c:8523 [inline] sctp_listen_start net/sctp/socket.c:8567 [inline] sctp_inet_listen+0x710/0xfd0 net/sctp/socket.c:8636 __sys_listen_socket net/socket.c:1912 [inline] __sys_listen net/socket.c:1927 [inline] __do_sys_listen net/socket.c:1932 [inline] __se_sys_listen net/socket.c:1930 [inline] __x64_sys_listen+0x343/0x4c0 net/socket.c:1930 x64_sys_call+0x271d/0x3e20 arch/x86/include/generated/asm/syscalls_64.h:51 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xd9/0x210 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Local variable addr.i.i created at: sctp_get_port net/sctp/socket.c:8515 [inline] sctp_listen_start net/sctp/socket.c:8567 [inline] sctp_inet_listen+0x650/0xfd0 net/sctp/socket.c:8636 __sys_listen_socket net/socket.c:1912 [inline] __sys_listen net/socket.c:1927 [inline] __do_sys_listen net/socket.c:1932 [inline] __se_sys_listen net/socket.c:1930 [inline] __x64_sys_listen+0x343/0x4c0 net/socket.c:1930 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+e69f06a0f30116c68056@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68adc0a2.050a0220.37038e.00c4.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Acked-by: Xin Long Link: https://patch.msgid.link/20250826141314.1802610-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/ipv6.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 3336dcfb4515..568ff8797c39 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -547,7 +547,9 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; + addr->v6.sin6_scope_id = 0; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ -- cgit v1.2.3 From 9b8c88f875c04d4cb9111bd5dd9291c7e9691bf5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 13:44:35 +0000 Subject: l2tp: do not use sock_hold() in pppol2tp_session_get_sock() pppol2tp_session_get_sock() is using RCU, it must be ready for sk_refcnt being zero. Commit ee40fb2e1eb5 ("l2tp: protect sock pointer of struct pppol2tp_session with RCU") was correct because it had a call_rcu(..., pppol2tp_put_sk) which was later removed in blamed commit. 
pppol2tp_recv() can use pppol2tp_session_get_sock() as well. Fixes: c5cbaef992d6 ("l2tp: refactor ppp socket/session relationship") Signed-off-by: Eric Dumazet Cc: James Chapman Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20250826134435.1683435-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/l2tp/l2tp_ppp.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index fc5c2fd8f34c..5e12e7ce17d8 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -129,22 +129,12 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = { static const struct proto_ops pppol2tp_ops; -/* Retrieves the pppol2tp socket associated to a session. - * A reference is held on the returned socket, so this function must be paired - * with sock_put(). - */ +/* Retrieves the pppol2tp socket associated to a session. */ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk; - - rcu_read_lock(); - sk = rcu_dereference(ps->sk); - if (sk) - sock_hold(sk); - rcu_read_unlock(); - return sk; + return rcu_dereference(ps->sk); } /* Helpers to obtain tunnel/session contexts from sockets. @@ -206,14 +196,13 @@ end: static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk = NULL; + struct sock *sk; /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ rcu_read_lock(); - sk = rcu_dereference(ps->sk); + sk = pppol2tp_session_get_sock(session); if (!sk) goto no_sock; @@ -510,13 +499,14 @@ static void pppol2tp_show(struct seq_file *m, void *arg) struct l2tp_session *session = arg; struct sock *sk; + rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); - sock_put(sk); } + rcu_read_unlock(); } static void pppol2tp_session_init(struct l2tp_session *session) @@ -1530,6 +1520,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) port = ntohs(inet->inet_sport); } + rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { state = sk->sk_state; @@ -1565,8 +1556,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); - sock_put(sk); } + rcu_read_unlock(); } static int pppol2tp_seq_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 1cc8a5b534e5f9b5e129e54ee2e63c9f5da4f39a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Aug 2025 17:21:49 +0000 Subject: net: rose: fix a typo in rose_clear_routes() syzbot crashed in rose_clear_routes(), after a recent patch typo. 
KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
CPU: 0 UID: 0 PID: 10591 Comm: syz.3.1856 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025
RIP: 0010:rose_clear_routes net/rose/rose_route.c:565 [inline]
RIP: 0010:rose_rt_ioctl+0x162/0x1250 net/rose/rose_route.c:760
 rose_ioctl+0x3ce/0x8b0 net/rose/af_rose.c:1381
 sock_do_ioctl+0xd9/0x300 net/socket.c:1238
 sock_ioctl+0x576/0x790 net/socket.c:1359
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:598 [inline]
 __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:584
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: da9c9c877597 ("net: rose: include node references in rose_neigh refcount")
Reported-by: syzbot+2eb8d1719f7cfcfa6840@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/68af3e29.a70a0220.3cafd4.002e.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet
Cc: Takamitsu Iwai
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250827172149.5359-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/rose/rose_route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'net')

diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c
index 1adee1fbc2ed..a1e9b05ef6f5 100644
--- a/net/rose/rose_route.c
+++ b/net/rose/rose_route.c
@@ -562,7 +562,7 @@ static int rose_clear_routes(void)
 		rose_node = rose_node->next;

 		if (!t->loopback) {
-			for (i = 0; i < rose_node->count; i++)
+			for (i = 0; i < t->count; i++)
 				rose_neigh_put(t->neighbour[i]);
 			rose_remove_node(t);
 		}
-- cgit v1.2.3


From 97bcc5b6f45425ac56fb04b0893cdaa607ec7e45 Mon Sep 17 00:00:00 2001
From: Krishna Kumar
Date: Mon, 25 Aug 2025 08:40:04 +0530
Subject: net: Prevent RPS table overwrite of active flows

This patch fixes an issue where two different flows on the same RXq
produce the same hash, resulting in continuous flow overwrites.

Flow #1: A packet for Flow #1 comes in, the kernel calls the steering
function. The driver gives back a filter id. The kernel saves this
filter id in the selected slot. Later, the driver's service task checks
if any filters have expired and then installs the rule for Flow #1.

Flow #2: A packet for Flow #2 comes in. It goes through the same steps.
But this time, the chosen slot is being used by Flow #1. The driver
gives a new filter id and the kernel saves it in the same slot. When
the driver's service task runs, it runs through all the flows, checks
if Flow #1 should be expired, the kernel returns True as the slot has a
different filter id, and then the driver installs the rule for Flow #2.

Flow #1: Another packet for Flow #1 comes in. The same thing repeats.
The slot is overwritten with a new filter id for Flow #1.

This causes a repeated cycle of flow programming for missed packets,
wasting CPU cycles while not improving performance. This problem
happens at higher rates when the RPS table is small, but tests show it
still happens even with 12,000 connections and an RPS size of 16K per
queue (global table size = 144x16K = 64K).

This patch prevents overwriting an rps_dev_flow entry if it is active.
The intention is that it is better to do aRFS for the first flow
instead of hurting all flows on the same hash. Without this, two (or
more) flows on one RX queue with the same hash can keep overwriting
each other. This causes the driver to reprogram the flow repeatedly.

Changes:
1. Add a new 'hash' field to struct rps_dev_flow.
2. Add rps_flow_is_active(): a helper function to check if a flow is
   active or not, extracted from rps_may_expire_flow(). It is further
   simplified as per reviewer feedback.
3. In set_rps_cpu():
   - Avoid overwriting by programming a new filter if:
     - The slot is not in use, or
     - The slot is in use but the flow is not active, or
     - The slot has an active flow with the same hash, but the target
       CPU differs.
   - Save the hash in the rps_dev_flow entry.
4. rps_may_expire_flow(): Use the earlier extracted rps_flow_is_active().

Testing & results:
- Driver: ice (E810 NIC), Kernel: net-next
- #CPUs = #RXq = 144 (1:1)
- Number of flows: 12K
- Eight RPS settings from 256 to 32768. Though RPS=256 is not ideal,
  it is still sufficient to cover 12K flows (256*144 rx-queues = 64K
  global table slots)
- Global Table Size = 144 * RPS (effectively equal to 256 * RPS)
- Each RPS test duration = 8 mins (org code) + 8 mins (new code).
- Metrics captured on client

Legend for following tables:
Steer-C: #times ndo_rx_flow_steer() was Called by set_rps_cpu()
Steer-L: #times ice_arfs_flow_steer() Looped over aRFS entries
Add:     #times driver actually programmed aRFS (ice_arfs_build_entry())
Del:     #times driver deleted the flow (ice_arfs_del_flow_rules())
Units:   K = 1,000 times, M = 1 million times

|-------|---------|------|     Org Code     |---------|---------|
|  RPS  | Latency | CPU  |  Add   |  Del   | Steer-C | Steer-L |
|-------|---------|------|--------|--------|---------|---------|
| 256   | 227.0   | 93.2 | 1.6M   | 1.6M   | 121.7M  | 267.6M  |
| 512   | 225.9   | 94.1 | 11.5M  | 11.2M  | 65.7M   | 199.6M  |
| 1024  | 223.5   | 95.6 | 16.5M  | 16.5M  | 27.1M   | 187.3M  |
| 2048  | 222.2   | 96.3 | 10.5M  | 10.5M  | 12.5M   | 115.2M  |
| 4096  | 223.9   | 94.1 | 5.5M   | 5.5M   | 7.2M    | 65.9M   |
| 8192  | 224.7   | 92.5 | 2.7M   | 2.7M   | 3.0M    | 29.9M   |
| 16384 | 223.5   | 92.5 | 1.3M   | 1.3M   | 1.4M    | 13.9M   |
| 32768 | 219.6   | 93.2 | 838.1K | 838.1K | 965.1K  | 8.9M    |
|-------|---------|------|     New Code     |---------|---------|
| 256   | 201.5   | 99.1 | 13.4K  | 5.0K   | 13.7K   | 75.2K   |
| 512   | 202.5   | 98.2 | 11.2K  | 5.9K   | 11.2K   | 55.5K   |
| 1024  | 207.3   | 93.9 | 11.5K  | 9.7K   | 11.5K   | 59.6K   |
| 2048  | 207.5   | 96.7 | 11.8K  | 11.1K  | 15.5K   | 79.3K   |
| 4096  | 206.9   | 96.6 | 11.8K  | 11.7K  | 11.8K   | 63.2K   |
| 8192  | 205.8   | 96.7 | 11.9K  | 11.8K  | 11.9K   | 63.9K   |
| 16384 | 200.9   | 98.2 | 11.9K  | 11.9K  | 11.9K   | 64.2K   |
| 32768 | 202.5   | 98.0 | 11.9K  | 11.9K  | 11.9K   | 64.2K   |
|-------|---------|------|--------|--------|---------|---------|

Some observations:
1. Overall Latency improved: (1790.19-1634.94)/1790.19*100 = 8.67%
2. Overall CPU increased: (777.32-751.49)/751.49*100 = 3.44%
3. Flow Management (add/delete) remained almost constant at ~11K
   compared to values in millions.

Signed-off-by: Krishna Kumar
Link: https://patch.msgid.link/20250825031005.3674864-2-krikku@gmail.com
Signed-off-by: Jakub Kicinski
---
 include/net/rps.h    |  7 ++++--
 net/core/dev.c       | 64 ++++++++++++++++++++++++++++++++++++++++++++++------
 net/core/net-sysfs.c |  4 +++-
 3 files changed, 65 insertions(+), 10 deletions(-)
(limited to 'net')

diff --git a/include/net/rps.h b/include/net/rps.h
index d8ab3a08bcc4..9917dce42ca4 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -25,13 +25,16 @@ struct rps_map {

 /*
  * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
- * tail pointer for that CPU's input queue at the time of last enqueue, and
- * a hardware filter index.
+ * tail pointer for that CPU's input queue at the time of last enqueue, a
+ * hardware filter index, and the hash of the flow if aRFS is enabled.
*/ struct rps_dev_flow { u16 cpu; u16 filter; unsigned int last_qtail; +#ifdef CONFIG_RFS_ACCEL + u32 hash; +#endif }; #define RPS_NO_FILTER 0xffff diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..fbf0894c55d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4849,6 +4849,36 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) return hash_32(hash, flow_table->log); } +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -4859,8 +4889,11 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; u32 flow_id; + u32 hash; int rc; /* Should we steer this flow to a different hardware queue? 
*/ @@ -4875,14 +4908,32 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = rfs_slot(skb_get_hash(skb), flow_table); + + hash = skb_get_hash(skb); + flow_id = rfs_slot(hash, flow_table); + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; + rflow = tmp_rflow; WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: @@ -5017,17 +5068,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && - ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - - READ_ONCE(rflow->last_qtail)) < - (int)(10 << flow_table->log))) + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd6665444..5ea9f64adce3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return -ENOMEM; table->log = ilog2(mask) + 1; - for (count = 0; count <= mask; count++) + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; + table->flows[count].filter = RPS_NO_FILTER; + } } else { table = NULL; } -- cgit v1.2.3 From 48aa30443e52c9666d5cd5e67532e475f212337e Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Mon, 25 Aug 2025 08:40:05 +0530 Subject: net: Cache hash and flow_id to avoid recalculation get_rps_cpu() can cache flow_id and hash as both are required by set_rps_cpu() instead of recalculating them twice. Signed-off-by: Krishna Kumar Link: https://patch.msgid.link/20250825031005.3674864-3-krikku@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fbf0894c55d0..1d1650d9ecff 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4881,7 +4881,8 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow, static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, + u32 flow_id) { if (next_cpu < nr_cpu_ids) { u32 head; @@ -4892,8 +4893,6 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *tmp_rflow; unsigned int tmp_cpu; u16 rxq_index; - u32 flow_id; - u32 hash; int rc; /* Should we steer this flow to a different hardware queue? 
*/ @@ -4909,9 +4908,6 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!flow_table) goto out; - hash = skb_get_hash(skb); - flow_id = rfs_slot(hash, flow_table); - tmp_rflow = &flow_table->flows[flow_id]; tmp_cpu = READ_ONCE(tmp_rflow->cpu); @@ -4959,6 +4955,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; + u32 flow_id; u32 tcpu; u32 hash; @@ -5005,7 +5002,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; + flow_id = rfs_slot(hash, flow_table); + rflow = &flow_table->flows[flow_id]; tcpu = rflow->cpu; /* @@ -5024,7 +5022,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, + flow_id); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { -- cgit v1.2.3 From 5189446ba995556eaa3755a6e875bc06675b88bd Mon Sep 17 00:00:00 2001 From: Oscar Maes Date: Wed, 27 Aug 2025 08:23:21 +0200 Subject: net: ipv4: fix regression in local-broadcast routes Commit 9e30ecf23b1b ("net: ipv4: fix incorrect MTU in broadcast routes") introduced a regression where local-broadcast packets would have their gateway set in __mkroute_output, which was caused by fi = NULL being removed. Fix this by resetting the fib_info for local-broadcast packets. This preserves the intended changes for directed-broadcast packets. Cc: stable@vger.kernel.org Fixes: 9e30ecf23b1b ("net: ipv4: fix incorrect MTU in broadcast routes") Reported-by: Brett A C Sheffield Closes: https://lore.kernel.org/regressions/20250822165231.4353-4-bacs@librecast.net Signed-off-by: Oscar Maes Reviewed-by: David Ahern Link: https://patch.msgid.link/20250827062322.4807-1-oscmaes92@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f639a2ae881a..baa43e5966b1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2575,12 +2575,16 @@ static struct rtable *__mkroute_output(const struct fib_result *res, !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); - if (ipv4_is_lbcast(fl4->daddr)) + if (ipv4_is_lbcast(fl4->daddr)) { type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl4->daddr)) + + /* reset fi to prevent gateway resolution */ + fi = NULL; + } else if (ipv4_is_multicast(fl4->daddr)) { type = RTN_MULTICAST; - else if (ipv4_is_zeronet(fl4->daddr)) + } else if (ipv4_is_zeronet(fl4->daddr)) { return ERR_PTR(-EINVAL); + } if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; -- cgit v1.2.3 From f86f42ed2c471da5b061492bb8ab1d3d73c19c58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:27 +0000 Subject: net: add sk_drops_read(), sk_drops_inc() and sk_drops_reset() helpers We want to split sk->sk_drops in the future to reduce potential contention on this field. 
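As a sketch of the conversion this enables (the helper names are the ones
added in the diff below), call sites stop touching the atomic directly, so
the storage behind the counter can later change in one place instead of
needing another tree-wide sweep:

	/* before */
	atomic_inc(&sk->sk_drops);
	drops = atomic_read(&sk->sk_drops);

	/* after */
	sk_drops_inc(sk);
	drops = sk_drops_read(sk);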
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 17 ++++++++++++++++- include/net/tcp.h | 2 +- net/core/datagram.c | 2 +- net/core/sock.c | 14 +++++++------- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 6 +++--- net/ipv4/udp.c | 14 +++++++------- net/ipv6/datagram.c | 2 +- net/ipv6/raw.c | 8 ++++---- net/ipv6/udp.c | 6 +++--- net/iucv/af_iucv.c | 4 ++-- net/netlink/af_netlink.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/pep.c | 6 +++--- net/phonet/socket.c | 2 +- net/sctp/diag.c | 2 +- net/tipc/socket.c | 6 +++--- 17 files changed, 57 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 63a6a48afb48..34d7029eb622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,11 +2682,26 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_inc(struct sock *sk) +{ + atomic_inc(&sk->sk_drops); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2936b8175950..16dc9cebb9d2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2612,7 +2612,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) */ static inline void tcp_listendrop(const struct sock *sk) { - atomic_inc(&((struct sock *)sk)->sk_drops); + sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..ba8253aa6e07 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); diff --git a/net/core/sock.c b/net/core/sock.c index 8002ac6293dc..75368823969a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } @@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); goto discard_and_relse; } @@ -2505,7 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 
newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); @@ -3713,7 +3713,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); @@ -3973,7 +3973,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 031df4c19fcc..f119da68fc30 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1119,7 +1119,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d2c89d63cc7..0f9f02f6146e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -1045,7 +1045,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), - refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); + refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3ce0f762ec..732bdad43626 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1787,7 +1787,7 @@ uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); busylock_release(busy); return err; } @@ -1852,7 +1852,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2008,7 +2008,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2078,7 +2078,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2449,7 +2449,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - 
atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2534,7 +2534,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, @@ -3386,7 +3386,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 972bf0426d59..33ebe93d80e3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c3f8245c40f..4026192143ec 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a68f77da44b..a35ee6d693a8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cc2b3c44bc05..6c717a7ef292 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return; } @@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) 
skb_reset_network_header(skb); IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return NET_RX_SUCCESS; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e2f7080dd5d7..2b46c0cd752a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk) sk_error_report(sk); } } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) @@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), - atomic_read(&s->sk_drops), + sk_drops_read(s), sock_i_ino(s) ); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a7017d7f0927..9d42c4bd6e39 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2265,7 +2265,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 62527e1ebb88..4db564d9d522 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); break; } __skb_pull(skb, 4); @@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = -ENOBUFS; break; } @@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = NET_RX_DROP; break; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 2b61a40b568e..db2d552e9b32 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops)); + sk_drops_read(sk)); } seq_pad(seq, '\n'); return 0; diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 23359e522273..996c2018f0e6 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e028bf658499..1574a83384f8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload2!"); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = TIPC_ERR_OVERLOAD; } @@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* 
Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, "@sk_enqueue!"); @@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, - atomic_read(&sk->sk_drops))) + sk_drops_read(sk))) goto stat_msg_cancel; if (tsk->cong_link_cnt && -- cgit v1.2.3 From cb4d5a6eb600a43c2e3ec7f54e06d07aa33d8062 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:28 +0000 Subject: net: add sk_drops_skbadd() helper Existing sk_drops_add() helper is renamed to sk_drops_skbadd(). Add sk_drops_add() and convert sk_drops_inc() to use it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/skmsg.h | 2 +- include/net/sock.h | 11 ++++++++--- include/net/udp.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- net/mptcp/protocol.c | 2 +- 7 files changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 0b9095a281b8..49847888c287 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -315,7 +315,7 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); kfree_skb(skb); } diff --git a/include/net/sock.h b/include/net/sock.h index 34d7029eb622..9edb42ff0622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,9 +2682,14 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + atomic_add(segs, &sk->sk_drops); +} + static inline void sk_drops_inc(struct sock *sk) { - atomic_inc(&sk->sk_drops); + sk_drops_add(sk, 1); } static inline int sk_drops_read(const struct sock *sk) @@ -2704,11 +2709,11 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) diff --git a/include/net/udp.h b/include/net/udp.h index e2af3bda90c9..7b26d4c50f33 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -627,7 +627,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; drop: - atomic_add(drop_count, &sk->sk_drops); + sk_drops_add(sk, drop_count); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); kfree_skb(skb); return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a52a747d8a55..f1be65af1a77 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4830,7 +4830,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } diff --git 
a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a0c93b24c6e0..7c1d612afca1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2254,7 +2254,7 @@ lookup:
 				   &iph->saddr, &iph->daddr,
 				   AF_INET, dif, sdif);
 		if (unlikely(drop_reason)) {
-			sk_drops_add(sk, skb);
+			sk_drops_skbadd(sk, skb);
 			reqsk_put(req);
 			goto discard_it;
 		}
@@ -2399,7 +2399,7 @@ discard_it:
 	return 0;

 discard_and_relse:
-	sk_drops_add(sk, skb);
+	sk_drops_skbadd(sk, skb);
 	if (refcounted)
 		sock_put(sk);
 	goto discard_it;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8b2e7b7afbd8..b4e56b877273 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1809,7 +1809,7 @@ lookup:
 				   &hdr->saddr, &hdr->daddr,
 				   AF_INET6, dif, sdif);
 		if (drop_reason) {
-			sk_drops_add(sk, skb);
+			sk_drops_skbadd(sk, skb);
 			reqsk_put(req);
 			goto discard_it;
 		}
@@ -1948,7 +1948,7 @@ discard_it:
 	return 0;

 discard_and_relse:
-	sk_drops_add(sk, skb);
+	sk_drops_skbadd(sk, skb);
 	if (refcounted)
 		sock_put(sk);
 	goto discard_it;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f2e728239480..ad41c48126e4 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -137,7 +137,7 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)

 static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
 {
-	sk_drops_add(sk, skb);
+	sk_drops_skbadd(sk, skb);
 	__kfree_skb(skb);
 }
-- cgit v1.2.3


From c51613fa276f038bdd18656a57a90ccc5d4e5200 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 26 Aug 2025 12:50:29 +0000
Subject: net: add sk->sk_drop_counters

Some sockets suffer from heavy false sharing on sk->sk_drops and on
other fields in the same cache line.

Add sk->sk_drop_counters to:
- move the drop counter(s) to dedicated cache lines.
- add basic NUMA awareness to these drop counter(s).

Following patches will use this infrastructure for UDP and RAW sockets.

sk_clone_lock() is not yet ready: it would need to properly set
newsk->sk_drop_counters if we plan to use this for TCP sockets.
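For scale (an illustration assuming 64-byte cache lines, not text from the
patch): the two ____cacheline_aligned_in_smp counters introduced below each
occupy their own cache line, so the structure weighs in at two lines total:

	/* Illustrative check, assuming SMP_CACHE_BYTES == 64:
	 * drops0 sits at offset 0 and drops1 at offset 64, so producers
	 * on different NUMA nodes dirty different cache lines, while
	 * sk_drops_read() still sums them into one logical counter.
	 */
	BUILD_BUG_ON(sizeof(struct socket_drop_counters) != 128);

This is where the "128 bytes per RAW socket" cost quoted later in the series
comes from.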
v2: used Paolo suggestion from https://lore.kernel.org/netdev/8f09830a-d83d-43c9-b36b-88ba0a23e9b2@redhat.com/ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 32 +++++++++++++++++++++++++++++++- net/core/sock.c | 2 ++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 9edb42ff0622..73cd3316e288 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,6 +102,11 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; +struct socket_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -282,6 +287,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to socket_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -449,6 +455,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif + struct socket_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2684,7 +2691,18 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - atomic_add(segs, &sk->sk_drops); + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + int n = numa_node_id() % 2; + + if (n) + atomic_add(segs, &sdc->drops1); + else + atomic_add(segs, &sdc->drops0); + } else { + atomic_add(segs, &sk->sk_drops); + } } static inline void sk_drops_inc(struct sock *sk) @@ -2694,11 +2712,23 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { + const struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { + struct socket_drop_counters *sdc = sk->sk_drop_counters; + + if (sdc) { + atomic_set(&sdc->drops0, 0); + atomic_set(&sdc->drops1, 0); + } atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/sock.c b/net/core/sock.c index 75368823969a..e66ad1ec3a2d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2505,6 +2505,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -4457,6 +4458,7 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); -- cgit v1.2.3 From b81aa23234d94d99951761d9864061d774633ba9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:31 +0000 
Subject: inet: raw: add drop_counters to raw sockets When a packet flood hits one or more RAW sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to raw sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per RAW socket. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/raw.h | 1 + net/ipv4/raw.c | 1 + net/ipv6/raw.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bc6ec2959173..261d02efb615 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - + struct socket_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..d52709139060 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct socket_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0f9f02f6146e..d54ebb7df966 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4026192143ec..4ae07a67b4d4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; -- cgit v1.2.3 From 75575e2d252afb29fdbcbeec4d67e042007add52 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 26 Aug 2025 20:26:01 +0300 Subject: wifi: mac80211: do not permit 40 MHz EHT operation on 5/6 GHz The EHT PHY requirements state that 80 MHz must be supported on the 5 and 6 GHz bands unless the STA is 20 MHz only. So if the channel width is limited to 40 MHz on a band other than 2.4 GHz, then disable EHT and downgrade to HE. The primary case where this can happen is if the hardware disables puncturing using IEEE80211_HW_DISALLOW_PUNCTURING. 
Signed-off-by: Benjamin Berg Cc: stable@vger.kernel.org Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250826202553.a6582f3abf57.Ic670429dc7127f68c818b4290d950ebfb5a0b9e1@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 8 ++++++++ net/mac80211/tests/chan-mode.c | 30 +++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1008eb8e9b13..dd650a127a31 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1189,6 +1189,14 @@ again: "required MCSes not supported, disabling EHT\n"); } + if (conn->mode >= IEEE80211_CONN_MODE_EHT && + channel->band != NL80211_BAND_2GHZ && + conn->bw_limit == IEEE80211_CONN_BW_LIMIT_40) { + conn->mode = IEEE80211_CONN_MODE_HE; + link_id_info(sdata, link_id, + "required bandwidth not supported, disabling EHT\n"); + } + /* the mode can only decrease, so this must terminate */ if (ap_mode != conn->mode) { kfree(elems); diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c index 96c7b3ab2744..adc069065e73 100644 --- a/net/mac80211/tests/chan-mode.c +++ b/net/mac80211/tests/chan-mode.c @@ -2,7 +2,7 @@ /* * KUnit tests for channel mode functions * - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation */ #include #include @@ -28,6 +28,10 @@ static const struct determine_chan_mode_case { u8 vht_basic_mcs_1_4, vht_basic_mcs_5_8; u8 he_basic_mcs_1_4, he_basic_mcs_5_8; u8 eht_mcs7_min_nss; + u16 eht_disabled_subchannels; + u8 eht_bw; + enum ieee80211_conn_bw_limit conn_bw_limit; + enum ieee80211_conn_bw_limit expected_bw_limit; int error; } determine_chan_mode_cases[] = { { @@ -128,6 +132,14 @@ static const struct determine_chan_mode_case { .conn_mode = IEEE80211_CONN_MODE_EHT, .eht_mcs7_min_nss = 0x15, .error = EINVAL, + }, { + .desc = "80 MHz EHT is downgraded to 40 MHz HE due to puncturing", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HE, + .conn_bw_limit = IEEE80211_CONN_BW_LIMIT_80, + .expected_bw_limit = IEEE80211_CONN_BW_LIMIT_40, + .eht_disabled_subchannels = 0x08, + .eht_bw = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ, } }; KUNIT_ARRAY_PARAM_DESC(determine_chan_mode, determine_chan_mode_cases, desc) @@ -138,7 +150,7 @@ static void test_determine_chan_mode(struct kunit *test) struct t_sdata *t_sdata = T_SDATA(test); struct ieee80211_conn_settings conn = { .mode = params->conn_mode, - .bw_limit = IEEE80211_CONN_BW_LIMIT_20, + .bw_limit = params->conn_bw_limit, }; struct cfg80211_bss cbss = { .channel = &t_sdata->band_5ghz.channels[0], @@ -191,14 +203,21 @@ static void test_determine_chan_mode(struct kunit *test) 0x7f, 0x01, 0x00, 0x88, 0x88, 0x88, 0x00, 0x00, 0x00, /* EHT Operation */ - WLAN_EID_EXTENSION, 0x09, WLAN_EID_EXT_EHT_OPERATION, - 0x01, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11, - 0x00, 0x00, 0x00, 0x00, 0x24, 0x00, + WLAN_EID_EXTENSION, 0x0b, WLAN_EID_EXT_EHT_OPERATION, + 0x03, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11, + 0x00, 0x00, 0x00, params->eht_bw, + params->eht_bw == IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ ? 
42 : 36,
		0x00,
		u16_get_bits(params->eht_disabled_subchannels, 0xff),
		u16_get_bits(params->eht_disabled_subchannels, 0xff00),
	};
	struct ieee80211_chan_req chanreq = {};
	struct cfg80211_chan_def ap_chandef = {};
	struct ieee802_11_elems *elems;

+	/* To force EHT downgrade to HE on punctured 80 MHz downgraded to 40 MHz */
+	set_bit(IEEE80211_HW_DISALLOW_PUNCTURING, t_sdata->local.hw.flags);
+
	if (params->strict)
		set_bit(IEEE80211_HW_STRICT, t_sdata->local.hw.flags);
	else
@@ -237,6 +256,7 @@ static void test_determine_chan_mode(struct kunit *test)
	} else {
		KUNIT_ASSERT_NOT_ERR_OR_NULL(test, elems);
		KUNIT_ASSERT_EQ(test, conn.mode, params->expected_mode);
+		KUNIT_ASSERT_EQ(test, conn.bw_limit, params->expected_bw_limit);
	}
}
-- cgit v1.2.3


From 13d8e05adf9dd06c74fcc6ba42ec4bf780fd557f Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Wed, 27 Aug 2025 17:39:55 +0300
Subject: queue_api: add support for fetching per queue DMA dev

For zerocopy (io_uring, devmem), there is an assumption that the parent
device can do DMA. However that is not always the case:
- Scalable Function netdevs [1] have the DMA device in the grandparent.
- For Multi-PF netdevs [2], queues can be associated with different DMA
  devices.

This patch introduces a queue-based interface allowing drivers to expose
a different DMA device for zerocopy.

[1] Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst
[2] Documentation/networking/multi-pf-netdev.rst

Signed-off-by: Dragos Tatulea
Reviewed-by: Pavel Begunkov
Reviewed-by: Mina Almasry
Link: https://patch.msgid.link/20250827144017.1529208-3-dtatulea@nvidia.com
Signed-off-by: Jakub Kicinski
---
 include/net/netdev_queues.h |  7 +++++++
 net/core/Makefile           |  1 +
 net/core/netdev_queues.c    | 27 +++++++++++++++++++++++++++
 3 files changed, 35 insertions(+)
 create mode 100644 net/core/netdev_queues.c
(limited to 'net')

diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 6e835972abd1..b9d02bc65c97 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -127,6 +127,9 @@ void netdev_stat_queue_sum(struct net_device *netdev,
 * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped
 *		    queue's memory is written at the specified address.
 *
+ * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used
+ *			   for this queue. Return NULL on error.
+ *
 * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
 * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
 * be called for an interface which is open.
@@ -144,6 +147,8 @@ struct netdev_queue_mgmt_ops {
	int			(*ndo_queue_stop)(struct net_device *dev,
						  void *per_queue_mem,
						  int idx);
+	struct device *		(*ndo_queue_get_dma_dev)(struct net_device *dev,
+							 int idx);
 };

 /**
@@ -321,4 +326,6 @@ static inline void netif_subqueue_sent(const struct net_device *dev,
						     get_desc, start_thrs); \
	})

+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
+
 #endif
diff --git a/net/core/Makefile b/net/core/Makefile
index b2a76ce33932..9ef2099c5426 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
 obj-y += net-sysfs.o
 obj-y += hotdata.o
 obj-y += netdev_rx_queue.o
+obj-y += netdev_queues.o
 obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
new file mode 100644
index 000000000000..251f27a8307f
--- /dev/null
+++ b/net/core/netdev_queues.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/netdev_queues.h>
+
+/**
+ * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
+ * @dev: net_device
+ * @idx: queue index
+ *
+ * Get dma device for zero-copy operations to be used for this queue.
+ * When such device is not available or valid, the function will return NULL.
+ *
+ * Return: Device or NULL on error
+ */
+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+{
+	const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+	struct device *dma_dev;
+
+	if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
+		dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
+	else
+		dma_dev = dev->dev.parent;
+
+	return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+}
-- cgit v1.2.3


From 7c7e94603a76d62efbc4da4d0eb7a221add0ecfa Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Wed, 27 Aug 2025 17:39:57 +0300
Subject: net: devmem: get netdev DMA device via new API

Switch to the new API for fetching DMA devices for a netdev. The API is
called with queue index 0 for now, which is equivalent to the previous
behavior.

This patch will allow devmem to work with devices where the DMA device
is not stored in the parent device. mlx5 SFs are an example of such a
device. Multi-PF netdevs are still problematic (as they were before
this change). Upcoming patches will address this for the rx binding.
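For a sense of the driver side (a hypothetical "foo" driver; this series does
not add such an implementation), a device whose queues DMA through something
other than dev->dev.parent would wire up the new callback along these lines:

	/* Hypothetical: per-queue DMA device, e.g. the PF owning this
	 * queue on a multi-PF netdev, stored in driver private data.
	 */
	static struct device *foo_queue_get_dma_dev(struct net_device *dev,
						    int idx)
	{
		struct foo_priv *priv = netdev_priv(dev);

		return priv->rxqs[idx].dma_dev;
	}

	static const struct netdev_queue_mgmt_ops foo_queue_mgmt_ops = {
		/* ...the other queue management callbacks... */
		.ndo_queue_get_dma_dev	= foo_queue_get_dma_dev,
	};

netdev_queue_get_dma_dev() above then returns the per-queue device instead of
falling back to dev->dev.parent.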
Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250827144017.1529208-5-dtatulea@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/devmem.c b/net/core/devmem.c index 24c591ab38ae..c58b24128727 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -182,6 +182,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; + struct device *dma_dev; struct scatterlist *sg; struct dma_buf *dmabuf; unsigned int sg_idx, i; @@ -192,6 +193,13 @@ net_devmem_bind_dmabuf(struct net_device *dev, if (IS_ERR(dmabuf)) return ERR_CAST(dmabuf); + dma_dev = netdev_queue_get_dma_dev(dev, 0); + if (!dma_dev) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); + goto err_put_dmabuf; + } + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, dev_to_node(&dev->dev)); if (!binding) { @@ -209,7 +217,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, binding->dmabuf = dmabuf; binding->direction = direction; - binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); -- cgit v1.2.3 From 512c88fb0e884cbb4c495b8f3351a9185d1d50b1 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Wed, 27 Aug 2025 17:39:59 +0300 Subject: net: devmem: pull out dma_dev out of net_devmem_bind_dmabuf Fetch the DMA device before calling net_devmem_bind_dmabuf() and pass it on as a parameter. This is needed for an upcoming change which will read the DMA device per queue. This patch has no functional changes. 
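The resulting call shape, as applied to the netdev-genl callers in the hunks
below, is simply:

	dma_dev = netdev_queue_get_dma_dev(netdev, 0);	/* still queue 0 for now */
	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
					 dmabuf_fd, priv, info->extack);

so the follow-up patches only need to change how dma_dev is chosen.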
Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250827144017.1529208-7-dtatulea@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 14 ++++++-------- net/core/devmem.h | 2 ++ net/core/netdev-genl.c | 12 ++++++++---- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/devmem.c b/net/core/devmem.c index c58b24128727..d9de31a6cc7f 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -176,30 +176,28 @@ err_close_rxq: struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; - struct device *dma_dev; struct scatterlist *sg; struct dma_buf *dmabuf; unsigned int sg_idx, i; unsigned long virtual; int err; - dmabuf = dma_buf_get(dmabuf_fd); - if (IS_ERR(dmabuf)) - return ERR_CAST(dmabuf); - - dma_dev = netdev_queue_get_dma_dev(dev, 0); if (!dma_dev) { - err = -EOPNOTSUPP; NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); - goto err_put_dmabuf; + return ERR_PTR(-EOPNOTSUPP); } + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, dev_to_node(&dev->dev)); if (!binding) { diff --git a/net/core/devmem.h b/net/core/devmem.h index 41cd6e1c9141..101150d761af 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner { void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack); @@ -170,6 +171,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 6314eb7bdf69..3e2d6aa6e060 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -876,6 +876,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) u32 ifindex, dmabuf_fd, rxq_idx; struct netdev_nl_sock *priv; struct net_device *netdev; + struct device *dma_dev; struct sk_buff *rsp; struct nlattr *attr; int rem, err = 0; @@ -921,8 +922,9 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, - priv, info->extack); + dma_dev = netdev_queue_get_dma_dev(netdev, 0); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, + dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -986,6 +988,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) struct net_devmem_dmabuf_binding *binding; struct netdev_nl_sock *priv; struct net_device *netdev; + struct device *dma_dev; u32 ifindex, dmabuf_fd; struct sk_buff *rsp; int err = 0; @@ -1032,8 +1035,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock_netdev; } - binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, - info->extack); + dma_dev = 
+	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
+					 dmabuf_fd, priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_unlock_netdev;
--
cgit v1.2.3

From 1b416902cd255f51be37c1b7f7307b9f7027e04f Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Wed, 27 Aug 2025 17:40:00 +0300
Subject: net: devmem: pre-read requested rx queues during bind

Instead of reading the requested rx queues after binding the buffer,
read the rx queues in advance in a bitmap and iterate over them when
needed. This is a preparation for fetching the DMA device for each
queue.

This patch has no functional changes besides adding an extra rq index
bounds check.

Signed-off-by: Dragos Tatulea
Reviewed-by: Mina Almasry
Link: https://patch.msgid.link/20250827144017.1529208-8-dtatulea@nvidia.com
Signed-off-by: Jakub Kicinski
---
 net/core/netdev-genl.c | 85 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 58 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 3e2d6aa6e060..739598d34657 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -869,17 +869,55 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
 	return err;
 }
 
-int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
+				     u32 rxq_bitmap_len,
+				     unsigned long *rxq_bitmap)
 {
+	const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
 	struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+	struct nlattr *attr;
+	int rem, err = 0;
+	u32 rxq_idx;
+
+	nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+			       genlmsg_data(info->genlhdr),
+			       genlmsg_len(info->genlhdr), rem) {
+		err = nla_parse_nested(tb, maxtype, attr,
+				       netdev_queue_id_nl_policy, info->extack);
+		if (err < 0)
+			return err;
+
+		if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+		    NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE))
+			return -EINVAL;
+
+		if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+			NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+			return -EINVAL;
+		}
+
+		rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+		if (rxq_idx >= rxq_bitmap_len) {
+			NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
+			return -EINVAL;
+		}
+
+		bitmap_set(rxq_bitmap, rxq_idx, 1);
+	}
+
+	return 0;
+}
+
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
 	struct net_devmem_dmabuf_binding *binding;
 	u32 ifindex, dmabuf_fd, rxq_idx;
 	struct netdev_nl_sock *priv;
 	struct net_device *netdev;
+	unsigned long *rxq_bitmap;
 	struct device *dma_dev;
 	struct sk_buff *rsp;
-	struct nlattr *attr;
-	int rem, err = 0;
+	int err = 0;
 	void *hdr;
 
 	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
@@ -922,37 +960,26 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock;
 	}
 
+	rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
+	if (!rxq_bitmap) {
+		err = -ENOMEM;
+		goto err_unlock;
+	}
+
+	err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
+					rxq_bitmap);
+	if (err)
+		goto err_rxq_bitmap;
+
 	dma_dev = netdev_queue_get_dma_dev(netdev, 0);
 	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
 					 dmabuf_fd, priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
-		goto err_unlock;
+		goto err_rxq_bitmap;
 	}
 
-	nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
-			       genlmsg_data(info->genlhdr),
-			       genlmsg_len(info->genlhdr), rem) {
-		err = nla_parse_nested(
-			tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
-			netdev_queue_id_nl_policy, info->extack);
-		if (err < 0)
-			goto err_unbind;
-
-		if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
-		    NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
-			err = -EINVAL;
-			goto err_unbind;
-		}
-
-		if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
-			NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
-			err = -EINVAL;
-			goto err_unbind;
-		}
-
-		rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
-
+	for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
 		err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
 						      info->extack);
 		if (err)
@@ -966,6 +993,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto err_unbind;
 
+	bitmap_free(rxq_bitmap);
+
 	netdev_unlock(netdev);
 
 	mutex_unlock(&priv->lock);
@@ -974,6 +1003,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 
 err_unbind:
 	net_devmem_unbind_dmabuf(binding);
+err_rxq_bitmap:
+	bitmap_free(rxq_bitmap);
 err_unlock:
 	netdev_unlock(netdev);
 err_unlock_sock:
--
cgit v1.2.3

From b8aab4bb9585078012f5b6020454337a47081501 Mon Sep 17 00:00:00 2001
From: Dragos Tatulea
Date: Wed, 27 Aug 2025 17:40:01 +0300
Subject: net: devmem: allow binding on rx queues with same DMA devices

Multi-PF netdevs have queues belonging to different PFs which also
means different DMA devices. This means that the binding on the DMA
buffer can be done to the incorrect device.

This change allows devmem binding to multiple queues only when the
queues have the same DMA device. Otherwise an error is returned.

Signed-off-by: Dragos Tatulea
Link: https://patch.msgid.link/20250827144017.1529208-9-dtatulea@nvidia.com
Signed-off-by: Jakub Kicinski
---
 net/core/netdev-genl.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 739598d34657..470fabbeacd9 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -908,6 +908,30 @@ static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
 	return 0;
 }
 
+static struct device *
+netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
+		      struct netlink_ext_ack *extack)
+{
+	struct device *dma_dev = NULL;
+	u32 rxq_idx, prev_rxq_idx;
+
+	for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+		struct device *rxq_dma_dev;
+
+		rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+		if (dma_dev && rxq_dma_dev != dma_dev) {
+			NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
+					   rxq_idx, prev_rxq_idx);
+			return ERR_PTR(-EOPNOTSUPP);
+		}
+
+		dma_dev = rxq_dma_dev;
+		prev_rxq_idx = rxq_idx;
+	}
+
+	return dma_dev;
+}
+
 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net_devmem_dmabuf_binding *binding;
@@ -971,7 +995,12 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		goto err_rxq_bitmap;
 
-	dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+	dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
+	if (IS_ERR(dma_dev)) {
+		err = PTR_ERR(dma_dev);
+		goto err_rxq_bitmap;
+	}
+
 	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
 					 dmabuf_fd, priv, info->extack);
 	if (IS_ERR(binding)) {
--
cgit v1.2.3

From 3133d5c15cb568470f4ec3ea9e0599543eecf3ea Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 27 Aug 2025 12:53:46 +0000
Subject: net_sched: remove BH blocking in eight actions
Followup of f45b45cbfae3 ("Merge branch 'net_sched-act-extend-rcu-use-in-dump-methods'")

We never grab tcf_lock from BH context in these modules:

act_connmark
act_csum
act_ct
act_ctinfo
act_mpls
act_nat
act_pedit
act_skbedit

No longer block BH when acquiring tcf_lock from init functions.

Signed-off-by: Eric Dumazet
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250827125349.3505302-2-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/sched/act_connmark.c | 4 ++--
 net/sched/act_csum.c     | 4 ++--
 net/sched/act_ct.c       | 4 ++--
 net/sched/act_ctinfo.c   | 4 ++--
 net/sched/act_mpls.c     | 4 ++--
 net/sched/act_nat.c      | 4 ++--
 net/sched/act_pedit.c    | 4 ++--
 net/sched/act_skbedit.c  | 4 ++--
 8 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 3e89927d7116..bf2d6b6da042 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -169,10 +169,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 
 	nparms->action = parm->action;
 
-	spin_lock_bh(&ci->tcf_lock);
+	spin_lock(&ci->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock));
-	spin_unlock_bh(&ci->tcf_lock);
+	spin_unlock(&ci->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 0939e6b2ba4d..8bad91753615 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -101,11 +101,11 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
 	params_new->update_flags = parm->update_flags;
 	params_new->action = parm->action;
 
-	spin_lock_bh(&p->tcf_lock);
+	spin_lock(&p->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params_new = rcu_replace_pointer(p->params, params_new,
 					 lockdep_is_held(&p->tcf_lock));
-	spin_unlock_bh(&p->tcf_lock);
+	spin_unlock(&p->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 6749a4a9a9cd..6d2355e73b0f 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1410,11 +1410,11 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
 		goto cleanup;
 
 	params->action = parm->action;
-	spin_lock_bh(&c->tcf_lock);
+	spin_lock(&c->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params = rcu_replace_pointer(c->params, params,
 				     lockdep_is_held(&c->tcf_lock));
-	spin_unlock_bh(&c->tcf_lock);
+	spin_unlock(&c->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 71efe04d00b5..6f79eed9a544 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -258,11 +258,11 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
 
 	cp_new->action = actparm->action;
 
-	spin_lock_bh(&ci->tcf_lock);
+	spin_lock(&ci->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch);
 	cp_new = rcu_replace_pointer(ci->params, cp_new,
 				     lockdep_is_held(&ci->tcf_lock));
-	spin_unlock_bh(&ci->tcf_lock);
+	spin_unlock(&ci->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index 6654011dcd2b..ed7bdaa23f0d 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -296,10 +296,10 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
 			 htons(ETH_P_MPLS_UC));
 	p->action = parm->action;
 
-	spin_lock_bh(&m->tcf_lock);
+	spin_lock(&m->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock));
-	spin_unlock_bh(&m->tcf_lock);
+	spin_unlock(&m->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 26241d80ebe0..9cc2a1772cf8 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -95,10 +95,10 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
 
 	p = to_tcf_nat(*a);
 
-	spin_lock_bh(&p->tcf_lock);
+	spin_lock(&p->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	oparm = rcu_replace_pointer(p->parms, nparm, lockdep_is_held(&p->tcf_lock));
-	spin_unlock_bh(&p->tcf_lock);
+	spin_unlock(&p->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 4b65901397a8..8fc8f577cb7a 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -280,10 +280,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 	p = to_pedit(*a);
 	nparms->action = parm->action;
 
-	spin_lock_bh(&p->tcf_lock);
+	spin_lock(&p->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	oparms = rcu_replace_pointer(p->parms, nparms, 1);
-	spin_unlock_bh(&p->tcf_lock);
+	spin_unlock(&p->tcf_lock);
 
 	if (oparms)
 		call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 8c1d1554f657..aa6b1744de21 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -261,11 +261,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 	params_new->mask = *mask;
 	params_new->action = parm->action;
 
-	spin_lock_bh(&d->tcf_lock);
+	spin_lock(&d->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params_new = rcu_replace_pointer(d->params, params_new,
 					 lockdep_is_held(&d->tcf_lock));
-	spin_unlock_bh(&d->tcf_lock);
+	spin_unlock(&d->tcf_lock);
 	if (params_new)
 		kfree_rcu(params_new, rcu);
 	if (goto_ch)
--
cgit v1.2.3

From 48b5e5dbdb234ffc951cacceaec7f8ee37c83b2d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 27 Aug 2025 12:53:47 +0000
Subject: net_sched: act_vlan: use RCU in tcf_vlan_dump()

Also storing tcf_action into struct tcf_vlan_params makes sure there is
no discrepancy in tcf_vlan_act().

No longer block BH in tcf_vlan_init() when acquiring tcf_lock.
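The dump-side pattern shared by this and the following conversions,
condensed (a sketch following tcf_vlan_dump(), not the literal hunk):

	rcu_read_lock();
	p = rcu_dereference(v->vlan_p);	/* was: spin_lock_bh() + rcu_dereference_protected() */
	opt.action = p->action;		/* action is snapshotted inside the params struct */
	/* ... fill the netlink attributes from *p ... */
	rcu_read_unlock();

Dumpers stop contending on tcf_lock; a concurrent init() may swap the
params pointer underneath them, but RCU keeps the old object readable
until the grace period ends.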
Signed-off-by: Eric Dumazet
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250827125349.3505302-3-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/tc_act/tc_vlan.h |  1 +
 net/sched/act_vlan.c         | 20 +++++++++-----------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 3f5e9242b5e8..beadee41669a 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -10,6 +10,7 @@
 #include
 
 struct tcf_vlan_params {
+	int               action;
 	int               tcfv_action;
 	unsigned char     tcfv_push_dst[ETH_ALEN];
 	unsigned char     tcfv_push_src[ETH_ALEN];
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 383bf18b6862..b46f980f3b2a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
 {
 	struct tcf_vlan *v = to_vlan(a);
 	struct tcf_vlan_params *p;
-	int action;
 	int err;
 	u16 tci;
 
@@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
 	if (skb_at_tc_ingress(skb))
 		skb_push_rcsum(skb, skb->mac_len);
 
-	action = READ_ONCE(v->tcf_action);
-
 	p = rcu_dereference_bh(v->vlan_p);
 
 	switch (p->tcfv_action) {
@@ -97,7 +94,7 @@ out:
 		skb_pull_rcsum(skb, skb->mac_len);
 	skb_reset_mac_len(skb);
 
-	return action;
+	return p->action;
 
 drop:
 	tcf_action_inc_drop_qstats(&v->common);
@@ -255,10 +252,11 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			ETH_ALEN);
 	}
 
-	spin_lock_bh(&v->tcf_lock);
+	p->action = parm->action;
+	spin_lock(&v->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
-	spin_unlock_bh(&v->tcf_lock);
+	spin_unlock(&v->tcf_lock);
 
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
@@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t t;
 
-	spin_lock_bh(&v->tcf_lock);
-	opt.action = v->tcf_action;
-	p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+	rcu_read_lock();
+	p = rcu_dereference(v->vlan_p);
+	opt.action = p->action;
 	opt.v_action = p->tcfv_action;
 
 	if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
 
 	tcf_tm_dump(&t, &v->tcf_tm);
 	if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&v->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&v->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
--
cgit v1.2.3

From e97ae742972f6cb57986a5ebb846048f80b90003 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 27 Aug 2025 12:53:48 +0000
Subject: net_sched: act_tunnel_key: use RCU in tunnel_key_dump()

Also storing tcf_action into struct tcf_tunnel_key_params makes sure
there is no discrepancy in tunnel_key_act().

No longer block BH in tunnel_key_init() when acquiring tcf_lock.
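The matching writer side, sketched generically for these act_*
conversions (the identifiers are illustrative and the release step
differs per action, e.g. tunnel_key uses tunnel_key_release_params()):

	new->action = parm->action;		/* snapshot before publication */
	spin_lock(&t->tcf_lock);		/* plain spin_lock: no BH context takes this lock */
	old = rcu_replace_pointer(t->params, new,
				  lockdep_is_held(&t->tcf_lock));
	spin_unlock(&t->tcf_lock);
	/* free "old" only after a grace period, once all RCU readers are done */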
Signed-off-by: Eric Dumazet
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250827125349.3505302-4-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/tc_act/tc_tunnel_key.h |  1 +
 net/sched/act_tunnel_key.c         | 20 +++++++++-----------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index 879fe8cff581..0f1925f97520 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -14,6 +14,7 @@
 struct tcf_tunnel_key_params {
 	struct rcu_head		rcu;
 	int			tcft_action;
+	int			action;
 	struct metadata_dst     *tcft_enc_metadata;
 };
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 2cef4b08befb..e1c8b48c217c 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
 {
 	struct tcf_tunnel_key *t = to_tunnel_key(a);
 	struct tcf_tunnel_key_params *params;
-	int action;
 
 	params = rcu_dereference_bh(t->params);
 
 	tcf_lastuse_update(&t->tcf_tm);
 	tcf_action_update_bstats(&t->common, skb);
-	action = READ_ONCE(t->tcf_action);
 
 	switch (params->tcft_action) {
 	case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
 		break;
 	}
 
-	return action;
+	return params->action;
 }
 
 static const struct nla_policy
@@ -532,11 +530,12 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	params_new->tcft_action = parm->t_action;
 	params_new->tcft_enc_metadata = metadata;
 
-	spin_lock_bh(&t->tcf_lock);
+	params_new->action = parm->action;
+	spin_lock(&t->tcf_lock);
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	params_new = rcu_replace_pointer(t->params, params_new,
 					 lockdep_is_held(&t->tcf_lock));
-	spin_unlock_bh(&t->tcf_lock);
+	spin_unlock(&t->tcf_lock);
 	tunnel_key_release_params(params_new);
 	if (goto_ch)
 		tcf_chain_put_by_act(goto_ch);
@@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 	};
 	struct tcf_t tm;
 
-	spin_lock_bh(&t->tcf_lock);
-	params = rcu_dereference_protected(t->params,
-					   lockdep_is_held(&t->tcf_lock));
-	opt.action = t->tcf_action;
+	rcu_read_lock();
+	params = rcu_dereference(t->params);
+	opt.action = params->action;
 	opt.t_action = params->tcft_action;
 
 	if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
@@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
 			  &tm, TCA_TUNNEL_KEY_PAD))
 		goto nla_put_failure;
-	spin_unlock_bh(&t->tcf_lock);
+	rcu_read_unlock();
 
 	return skb->len;
 
 nla_put_failure:
-	spin_unlock_bh(&t->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
--
cgit v1.2.3

From 53df77e7859042a92914d664c860f65d9689f88d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 27 Aug 2025 12:53:49 +0000
Subject: net_sched: act_skbmod: use RCU in tcf_skbmod_dump()

Also storing tcf_action into struct tcf_skbmod_params makes sure there
is no discrepancy in tcf_skbmod_act().

No longer block BH in tcf_skbmod_init() when acquiring tcf_lock.
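Why moving the action into the params struct matters on the fast path:
the act() handler used to issue two independent loads (tcf_action, then
the params pointer) and could pair a new action with old parameters.
One RCU snapshot now yields both, as in the tcf_skbmod_act() hunk below:

	p = rcu_dereference_bh(d->skbmod_p);	/* single consistent snapshot */
	if (unlikely(p->action == TC_ACT_SHOT))
		goto drop;
	/* ... edit the packet according to p->flags ... */
	return p->action;			/* was: READ_ONCE(d->tcf_action) */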
Signed-off-by: Eric Dumazet
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250827125349.3505302-5-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/tc_act/tc_skbmod.h |  1 +
 net/sched/act_skbmod.c         | 26 ++++++++++++--------------
 2 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h
index 7c240d2fed4e..626704cd6241 100644
--- a/include/net/tc_act/tc_skbmod.h
+++ b/include/net/tc_act/tc_skbmod.h
@@ -12,6 +12,7 @@
 struct tcf_skbmod_params {
 	struct rcu_head	rcu;
 	u64	flags; /*up to 64 types of operations; extend if needed */
+	int	action;
 	u8	eth_dst[ETH_ALEN];
 	u16	eth_type;
 	u8	eth_src[ETH_ALEN];
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index dc0229693461..fce625eafcb2 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
 				     struct tcf_result *res)
 {
 	struct tcf_skbmod *d = to_skbmod(a);
-	int action, max_edit_len, err;
 	struct tcf_skbmod_params *p;
+	int max_edit_len, err;
 	u64 flags;
 
 	tcf_lastuse_update(&d->tcf_tm);
 	bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
 
-	action = READ_ONCE(d->tcf_action);
-	if (unlikely(action == TC_ACT_SHOT))
+	p = rcu_dereference_bh(d->skbmod_p);
+	if (unlikely(p->action == TC_ACT_SHOT))
 		goto drop;
 
 	max_edit_len = skb_mac_header_len(skb);
-	p = rcu_dereference_bh(d->skbmod_p);
 	flags = p->flags;
 
 	/* tcf_skbmod_init() guarantees "flags" to be one of the following:
@@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
 		INET_ECN_set_ce(skb);
 
 out:
-	return action;
+	return p->action;
 
 drop:
 	qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
@@ -193,9 +192,9 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	}
 	p->flags = lflags;
-
+	p->action = parm->action;
 	if (ovr)
-		spin_lock_bh(&d->tcf_lock);
+		spin_lock(&d->tcf_lock);
 	/* Protected by tcf_lock if overwriting existing action. */
 	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
 	p_old = rcu_dereference_protected(d->skbmod_p, 1);
@@ -209,7 +208,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 	rcu_assign_pointer(d->skbmod_p, p);
 	if (ovr)
-		spin_unlock_bh(&d->tcf_lock);
+		spin_unlock(&d->tcf_lock);
 
 	if (p_old)
 		kfree_rcu(p_old, rcu);
@@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
 	opt.index   = d->tcf_index;
 	opt.refcnt  = refcount_read(&d->tcf_refcnt) - ref;
 	opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind;
-	spin_lock_bh(&d->tcf_lock);
-	opt.action = d->tcf_action;
-	p = rcu_dereference_protected(d->skbmod_p,
-				      lockdep_is_held(&d->tcf_lock));
+	rcu_read_lock();
+	p = rcu_dereference(d->skbmod_p);
+	opt.action = p->action;
 	opt.flags  = p->flags;
 	if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
 	if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
 		goto nla_put_failure;
 
-	spin_unlock_bh(&d->tcf_lock);
+	rcu_read_unlock();
 	return skb->len;
 nla_put_failure:
-	spin_unlock_bh(&d->tcf_lock);
+	rcu_read_unlock();
 	nlmsg_trim(skb, b);
 	return -1;
 }
--
cgit v1.2.3

From db2ab24a341ce89351a1bede37a96a3e3ce1726a Mon Sep 17 00:00:00 2001
From: Lauri Vasama
Date: Wed, 27 Aug 2025 16:39:00 +0300
Subject: Add RWF_NOSIGNAL flag for pwritev2
For a user mode library to avoid generating SIGPIPE signals (e.g.
because this behaviour is not portable across operating systems) is
cumbersome. It is generally bad form to change the process-wide signal
mask in a library, so a local solution is needed instead.

For I/O performed directly using system calls (synchronous or readiness
based asynchronous) this currently involves applying a thread-specific
signal mask before the operation and reverting it afterwards. This can
be avoided when it is known that the file descriptor refers to neither
a pipe nor a socket, but a conservative implementation must always
apply the mask. This incurs the cost of two additional system calls.
In the case of sockets, the existing MSG_NOSIGNAL flag can be used with
send.

For asynchronous I/O performed using io_uring, currently the only
option (apart from MSG_NOSIGNAL for sockets), is to mask SIGPIPE
entirely in the call to io_uring_enter. Thankfully io_uring_enter takes
a signal mask, so only a single syscall is needed. However, copying the
signal mask on every call incurs a non-zero performance penalty.
Furthermore, this mask applies to all completions, meaning that if the
non-signaling behaviour is desired only for some subset of operations,
the desired signals must be raised manually from user-mode depending on
the completed operation.

Add RWF_NOSIGNAL flag for pwritev2. This flag prevents the SIGPIPE
signal from being raised when writing on disconnected pipes or sockets.
The flag is handled directly by the pipe filesystem and converted to
the existing MSG_NOSIGNAL flag for sockets.

Signed-off-by: Lauri Vasama
Link: https://lore.kernel.org/20250827133901.1820771-1-git@vasama.org
Reviewed-by: Jens Axboe
Signed-off-by: Christian Brauner
---
 fs/pipe.c               | 6 ++++--
 include/linux/fs.h      | 1 +
 include/uapi/linux/fs.h | 5 ++++-
 net/socket.c            | 3 +++
 4 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/fs/pipe.c b/fs/pipe.c
index 731622d0738d..42fead1efe52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
 
 	mutex_lock(&pipe->mutex);
 	if (!pipe->readers) {
-		send_sig(SIGPIPE, current, 0);
+		if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+			send_sig(SIGPIPE, current, 0);
 		ret = -EPIPE;
 		goto out;
 	}
@@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
 
 		for (;;) {
 			if (!pipe->readers) {
-				send_sig(SIGPIPE, current, 0);
+				if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+					send_sig(SIGPIPE, current, 0);
 				if (!ret)
 					ret = -EPIPE;
 				break;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 780e9c774c54..34693cae15a2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -356,6 +356,7 @@ struct readahead_control;
 #define IOCB_APPEND		(__force int) RWF_APPEND
 #define IOCB_ATOMIC		(__force int) RWF_ATOMIC
 #define IOCB_DONTCACHE		(__force int) RWF_DONTCACHE
+#define IOCB_NOSIGNAL		(__force int) RWF_NOSIGNAL
 
 /* non-RWF related bits - start at 16 */
 #define IOCB_EVENTFD		(1 << 16)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 0bd678a4a10e..beb4c2d1e41c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t;
 /* buffered IO that drops the cache after reading or writing data */
 #define RWF_DONTCACHE	((__force __kernel_rwf_t)0x00000080)
 
+/* prevent pipe and socket writes from raising SIGPIPE */
+#define RWF_NOSIGNAL	((__force __kernel_rwf_t)0x00000100)
+
 /* mask of flags supported by the kernel */
 #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
			 RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
-			 RWF_DONTCACHE)
+			 RWF_DONTCACHE | RWF_NOSIGNAL)
 
 #define PROCFS_IOCTL_MAGIC 'f'
 
diff --git a/net/socket.c b/net/socket.c
index 682969deaed3..bac335ecee4c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (sock->type == SOCK_SEQPACKET)
 		msg.msg_flags |= MSG_EOR;
 
+	if (iocb->ki_flags & IOCB_NOSIGNAL)
+		msg.msg_flags |= MSG_NOSIGNAL;
+
 	res = __sock_sendmsg(sock, &msg);
 	*from = msg.msg_iter;
 	return res;
--
cgit v1.2.3

From 862c628108562d8c7a516a900034823b381d3cba Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Wed, 27 Aug 2025 20:40:14 +0000
Subject: Bluetooth: Fix use-after-free in l2cap_sock_cleanup_listen()

syzbot reported the splat below without a repro.

In the splat, a single thread calling bt_accept_dequeue() freed sk and
touched it after that.

The root cause would be the racy l2cap_sock_cleanup_listen() call added
by the cited commit.

bt_accept_dequeue() is called under lock_sock() except for
l2cap_sock_release().

Two threads could see the same socket during the list iteration in
bt_accept_dequeue():

	CPU1			CPU2 (close())
	----			----
	sock_hold(sk)		sock_hold(sk);
	lock_sock(sk) <-- block close()
	sock_put(sk)
	bt_accept_unlink(sk)
	  sock_put(sk) <-- refcnt by bt_accept_enqueue()
	release_sock(sk)
				lock_sock(sk)
				sock_put(sk)
				bt_accept_unlink(sk)
				  sock_put(sk) <-- last refcnt
				bt_accept_unlink(sk) <-- UAF

Depending on the timing, the other thread could show up in the
"Freed by task" part.

Let's call l2cap_sock_cleanup_listen() under lock_sock() in
l2cap_sock_release().

[0]:
BUG: KASAN: slab-use-after-free in debug_spin_lock_before kernel/locking/spinlock_debug.c:86 [inline]
BUG: KASAN: slab-use-after-free in do_raw_spin_lock+0x26f/0x2b0 kernel/locking/spinlock_debug.c:115
Read of size 4 at addr ffff88803b7eb1c4 by task syz.5.3276/16995

CPU: 3 UID: 0 PID: 16995 Comm: syz.5.3276 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:94 [inline]
 dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:378 [inline]
 print_report+0xcd/0x630 mm/kasan/report.c:482
 kasan_report+0xe0/0x110 mm/kasan/report.c:595
 debug_spin_lock_before kernel/locking/spinlock_debug.c:86 [inline]
 do_raw_spin_lock+0x26f/0x2b0 kernel/locking/spinlock_debug.c:115
 spin_lock_bh include/linux/spinlock.h:356 [inline]
 release_sock+0x21/0x220 net/core/sock.c:3746
 bt_accept_dequeue+0x505/0x600 net/bluetooth/af_bluetooth.c:312
 l2cap_sock_cleanup_listen+0x5c/0x2a0 net/bluetooth/l2cap_sock.c:1451
 l2cap_sock_release+0x5c/0x210 net/bluetooth/l2cap_sock.c:1425
 __sock_release+0xb3/0x270 net/socket.c:649
 sock_close+0x1c/0x30 net/socket.c:1439
 __fput+0x3ff/0xb70 fs/file_table.c:468
 task_work_run+0x14d/0x240 kernel/task_work.c:227
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 exit_to_user_mode_loop+0xeb/0x110 kernel/entry/common.c:43
 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
 syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
 do_syscall_64+0x3f6/0x4c0 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f2accf8ebe9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffdb6cb1378 EFLAGS: 00000246 ORIG_RAX: 00000000000001b4
RAX: 0000000000000000 RBX: 00000000000426fb RCX: 00007f2accf8ebe9
RDX: 0000000000000000 RSI: 000000000000001e RDI: 0000000000000003
RBP: 00007f2acd1b7da0 R08: 0000000000000001 R09: 00000012b6cb166f
R10: 0000001b30e20000 R11: 0000000000000246 R12: 00007f2acd1b609c
R13: 00007f2acd1b6090 R14: ffffffffffffffff R15: 00007ffdb6cb1490

Allocated by task 5326:
 kasan_save_stack+0x33/0x60 mm/kasan/common.c:47
 kasan_save_track+0x14/0x30 mm/kasan/common.c:68
 poison_kmalloc_redzone mm/kasan/common.c:388 [inline]
 __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:405
 kasan_kmalloc include/linux/kasan.h:260 [inline]
 __do_kmalloc_node mm/slub.c:4365 [inline]
 __kmalloc_noprof+0x223/0x510 mm/slub.c:4377
 kmalloc_noprof include/linux/slab.h:909 [inline]
 sk_prot_alloc+0x1a8/0x2a0 net/core/sock.c:2239
 sk_alloc+0x36/0xc20 net/core/sock.c:2295
 bt_sock_alloc+0x3b/0x3a0 net/bluetooth/af_bluetooth.c:151
 l2cap_sock_alloc.constprop.0+0x33/0x1d0 net/bluetooth/l2cap_sock.c:1894
 l2cap_sock_new_connection_cb+0x101/0x240 net/bluetooth/l2cap_sock.c:1482
 l2cap_connect_cfm+0x4c4/0xf80 net/bluetooth/l2cap_core.c:7287
 hci_connect_cfm include/net/bluetooth/hci_core.h:2050 [inline]
 hci_remote_features_evt+0x4dd/0x970 net/bluetooth/hci_event.c:3712
 hci_event_func net/bluetooth/hci_event.c:7519 [inline]
 hci_event_packet+0xa0d/0x11c0 net/bluetooth/hci_event.c:7573
 hci_rx_work+0x2c5/0x16b0 net/bluetooth/hci_core.c:4071
 process_one_work+0x9cf/0x1b70 kernel/workqueue.c:3236
 process_scheduled_works kernel/workqueue.c:3319 [inline]
 worker_thread+0x6c8/0xf10 kernel/workqueue.c:3400
 kthread+0x3c2/0x780 kernel/kthread.c:463
 ret_from_fork+0x5d7/0x6f0 arch/x86/kernel/process.c:148
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

Freed by task 16995:
 kasan_save_stack+0x33/0x60 mm/kasan/common.c:47
 kasan_save_track+0x14/0x30 mm/kasan/common.c:68
 kasan_save_free_info+0x3b/0x60 mm/kasan/generic.c:576
 poison_slab_object mm/kasan/common.c:243 [inline]
 __kasan_slab_free+0x60/0x70 mm/kasan/common.c:275
 kasan_slab_free include/linux/kasan.h:233 [inline]
 slab_free_hook mm/slub.c:2417 [inline]
 slab_free mm/slub.c:4680 [inline]
 kfree+0x2b4/0x4d0 mm/slub.c:4879
 sk_prot_free net/core/sock.c:2278 [inline]
 __sk_destruct+0x75f/0x9a0 net/core/sock.c:2373
 sk_destruct+0xc2/0xf0 net/core/sock.c:2401
 __sk_free+0xf4/0x3e0 net/core/sock.c:2412
 sk_free+0x6a/0x90 net/core/sock.c:2423
 sock_put include/net/sock.h:1960 [inline]
 bt_accept_unlink+0x245/0x2e0 net/bluetooth/af_bluetooth.c:262
 bt_accept_dequeue+0x517/0x600 net/bluetooth/af_bluetooth.c:308
 l2cap_sock_cleanup_listen+0x5c/0x2a0 net/bluetooth/l2cap_sock.c:1451
 l2cap_sock_release+0x5c/0x210 net/bluetooth/l2cap_sock.c:1425
 __sock_release+0xb3/0x270 net/socket.c:649
 sock_close+0x1c/0x30 net/socket.c:1439
 __fput+0x3ff/0xb70 fs/file_table.c:468
 task_work_run+0x14d/0x240 kernel/task_work.c:227
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 exit_to_user_mode_loop+0xeb/0x110 kernel/entry/common.c:43
 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline]
 syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline]
 do_syscall_64+0x3f6/0x4c0 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 1728137b33c0 ("Bluetooth: L2CAP: Fix use-after-free in l2cap_sock_ready_cb")
Reported-by: syzbot+e5e64cdf8e92046dd3e1@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/linux-bluetooth/68af6b9d.a70a0220.3cafd4.0032.GAE@google.com/
Signed-off-by: Kuniyuki Iwashima
Signed-off-by: Luiz Augusto von Dentz
---
 net/bluetooth/l2cap_sock.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index f4257c4d3052..814fb8610ac4 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1422,7 +1422,10 @@ static int l2cap_sock_release(struct socket *sock)
 	if (!sk)
 		return 0;
 
+	lock_sock_nested(sk, L2CAP_NESTING_PARENT);
 	l2cap_sock_cleanup_listen(sk);
+	release_sock(sk);
+
 	bt_sock_unlink(&l2cap_sk_list, sk);
 
 	err = l2cap_sock_shutdown(sock, SHUT_RDWR);
--
cgit v1.2.3

From 9f74c0ea9b26d1505d55b61e36b1623dd347e1d1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 27 Aug 2025 16:23:52 +0000
Subject: net_sched: gen_estimator: fix est_timer() vs CONFIG_PREEMPT_RT=y

syzbot reported a WARNING in est_timer() [1]

Problem here is that with CONFIG_PREEMPT_RT=y, timer callbacks can be
preempted.

Adopt preempt_disable_nested()/preempt_enable_nested() to fix this.

[1]
WARNING: CPU: 0 PID: 16 at ./include/linux/seqlock.h:221 __seqprop_assert include/linux/seqlock.h:221 [inline]
WARNING: CPU: 0 PID: 16 at ./include/linux/seqlock.h:221 est_timer+0x6dc/0x9f0 net/core/gen_estimator.c:93
Modules linked in:
CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Not tainted syzkaller #0 PREEMPT_{RT,(full)}
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025
RIP: 0010:__seqprop_assert include/linux/seqlock.h:221 [inline]
RIP: 0010:est_timer+0x6dc/0x9f0 net/core/gen_estimator.c:93
Call Trace:
 call_timer_fn+0x17e/0x5f0 kernel/time/timer.c:1747
 expire_timers kernel/time/timer.c:1798 [inline]
 __run_timers kernel/time/timer.c:2372 [inline]
 __run_timer_base+0x648/0x970 kernel/time/timer.c:2384
 run_timer_base kernel/time/timer.c:2393 [inline]
 run_timer_softirq+0xb7/0x180 kernel/time/timer.c:2403
 handle_softirqs+0x22c/0x710 kernel/softirq.c:579
 __do_softirq kernel/softirq.c:613 [inline]
 run_ktimerd+0xcf/0x190 kernel/softirq.c:1043
 smpboot_thread_fn+0x53f/0xa60 kernel/smpboot.c:160
 kthread+0x70e/0x8a0 kernel/kthread.c:463
 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.")
Reported-by: syzbot+72db9ee39db57c3fecc5@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/68adf6fa.a70a0220.3cafd4.0000.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet
Reviewed-by: Sebastian Andrzej Siewior
Link: https://patch.msgid.link/20250827162352.3960779-1-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/core/gen_estimator.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7d426a8e29f3..f112156db587 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -90,10 +90,12 @@ static void est_timer(struct timer_list *t)
 	rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
 	rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
 
+	preempt_disable_nested();
 	write_seqcount_begin(&est->seq);
 	est->avbps += brate;
 	est->avpps += rate;
 	write_seqcount_end(&est->seq);
+	preempt_enable_nested();
 
 	est->last_bytes = b_bytes;
 	est->last_packets = b_packets;
--
cgit v1.2.3

From 9a574257b968426df5c180df1199d4b082f80ff9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 10:27:34 +0000
Subject: inet_diag: annotate data-races in inet_diag_msg_common_fill()
inet_diag_msg_common_fill() can run without socket lock.

Add READ_ONCE() or data_race() annotations.

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250828102738.2065992-2-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/inet_diag.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 9d4dcd17728c..7a9c347bc66f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -71,25 +71,25 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
 
 void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
 {
-	r->idiag_family = sk->sk_family;
+	r->idiag_family = READ_ONCE(sk->sk_family);
 
-	r->id.idiag_sport = htons(sk->sk_num);
-	r->id.idiag_dport = sk->sk_dport;
-	r->id.idiag_if = sk->sk_bound_dev_if;
+	r->id.idiag_sport = htons(READ_ONCE(sk->sk_num));
+	r->id.idiag_dport = READ_ONCE(sk->sk_dport);
+	r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);
 	sock_diag_save_cookie(sk, r->id.idiag_cookie);
 
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == AF_INET6) {
-		*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
-		*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+	if (r->idiag_family == AF_INET6) {
+		data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
+		data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr);
 	} else
 #endif
 	{
 	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
 	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
 
-	r->id.idiag_src[0] = sk->sk_rcv_saddr;
-	r->id.idiag_dst[0] = sk->sk_daddr;
+	r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr);
+	r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr);
 	}
 }
 EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
--
cgit v1.2.3

From 8e60447f0831cdcafa2233e5547ee0eba8a5f8da Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 10:27:35 +0000
Subject: tcp: annotate data-races in tcp_req_diag_fill()

req->num_retrans and rsk_timer.expires are read locklessly, and can be
changed from tcp_rtx_synack().

Add READ_ONCE()/WRITE_ONCE() annotations.
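The discipline is the usual one for lockless readers: annotate both
sides of the race so it is declared intentional (to KCSAN and to human
readers) without imposing any ordering. Distilled from the diff below:

	/* writer, tcp_rtx_synack(), not serialized against the dump */
	WRITE_ONCE(req->num_retrans, req->num_retrans + 1);

	/* lockless reader, tcp_req_diag_fill() */
	r->idiag_retrans = READ_ONCE(reqsk->num_retrans);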
Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250828102738.2065992-3-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/tcp_diag.c   | 4 ++--
 net/ipv4/tcp_output.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 2f3a779ce7a2..4ed6b93527f4 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -248,12 +248,12 @@ static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 	inet_diag_msg_common_fill(r, sk);
 	r->idiag_state = TCP_SYN_RECV;
 	r->idiag_timer = 1;
-	r->idiag_retrans = reqsk->num_retrans;
+	r->idiag_retrans = READ_ONCE(reqsk->num_retrans);
 
 	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
 		     offsetof(struct sock, sk_cookie));
 
-	tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
+	tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies;
 	r->idiag_expires = jiffies_delta_to_msecs(tmo);
 	r->idiag_rqueue = 0;
 	r->idiag_wqueue = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 06b26a6efd62..e180364b8dda 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4438,7 +4438,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 			tcp_sk_rw(sk)->total_retrans++;
 		}
 		trace_tcp_retransmit_synack(sk, req);
-		req->num_retrans++;
+		WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
 	}
 	return res;
 }
--
cgit v1.2.3

From 4fd84a0aaf2ba125b441aa09d415022385e66bf2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 10:27:36 +0000
Subject: inet_diag: annotate data-races in inet_diag_bc_sk()

inet_diag_bc_sk() runs with an unlocked socket, annotate potential
races with READ_ONCE().

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250828102738.2065992-4-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/inet_diag.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7a9c347bc66f..3827e9979d4f 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -580,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
 			     const struct sock *sk)
 {
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == AF_INET6) {
+	if (entry->family == AF_INET6) {
 		entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
 		entry->daddr = sk->sk_v6_daddr.s6_addr32;
 	} else
@@ -593,18 +593,18 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
 
 int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
 {
-	struct inet_sock *inet = inet_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
 	struct inet_diag_entry entry;
 
 	if (!bc)
 		return 1;
 
-	entry.family = sk->sk_family;
+	entry.family = READ_ONCE(sk->sk_family);
 	entry_fill_addrs(&entry, sk);
-	entry.sport = inet->inet_num;
-	entry.dport = ntohs(inet->inet_dport);
-	entry.ifindex = sk->sk_bound_dev_if;
-	entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+	entry.sport = READ_ONCE(inet->inet_num);
+	entry.dport = ntohs(READ_ONCE(inet->inet_dport));
+	entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
+	entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
 	if (sk_fullsock(sk))
 		entry.mark = READ_ONCE(sk->sk_mark);
 	else if (sk->sk_state == TCP_NEW_SYN_RECV)
--
cgit v1.2.3

From 9529320ad64e614cfaf96e6b8e3d8c0a1245160c Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 10:27:37 +0000
Subject: inet_diag: change inet_diag_bc_sk() first argument

We want to have access to the inet_diag_dump_data structure in the
following patch.

This patch removes duplication in callers.

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250828102738.2065992-5-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/linux/inet_diag.h |  2 +-
 net/ipv4/inet_diag.c      |  3 ++-
 net/ipv4/raw_diag.c       | 10 +++-------
 net/ipv4/tcp_diag.c       |  8 +++-----
 net/ipv4/udp_diag.c       | 10 +++-------
 net/mptcp/mptcp_diag.c    | 15 ++++-----------
 6 files changed, 16 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 30bf8f7ea62b..86a0641ec36e 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -46,7 +46,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      const struct inet_diag_req_v2 *req,
 		      u16 nlmsg_flags, bool net_admin);
-int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk);
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk);
 
 void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk);
 
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 3827e9979d4f..117103042687 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -591,8 +591,9 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
 	}
 }
 
-int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
 {
+	const struct nlattr *bc = cb_data->inet_diag_nla_bc;
 	const struct inet_sock *inet = inet_sk(sk);
 	struct inet_diag_entry entry;
 
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index cc793bd8de25..943e5998e0ad 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb,
 static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 			struct netlink_callback *cb,
 			const struct inet_diag_req_v2 *r,
-			struct nlattr *bc, bool net_admin)
+			bool net_admin)
 {
-	if (!inet_diag_bc_sk(bc, sk))
+	if (!inet_diag_bc_sk(cb->data, sk))
 		return 0;
 
 	return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
@@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
 	struct net *net = sock_net(skb->sk);
-	struct inet_diag_dump_data *cb_data;
 	int num, s_num, slot, s_slot;
 	struct hlist_head *hlist;
 	struct sock *sk = NULL;
-	struct nlattr *bc;
 
 	if (IS_ERR(hashinfo))
 		return;
 
-	cb_data = cb->data;
-	bc = cb_data->inet_diag_nla_bc;
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
@@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 			if (r->id.idiag_dport != inet->inet_dport &&
 			    r->id.idiag_dport)
 				goto next;
 
-			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+			if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0)
 				goto out_unlock;
 next:
 			num++;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 4ed6b93527f4..d83efd91f461 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -320,11 +320,9 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	u32 idiag_states = r->idiag_states;
 	struct inet_hashinfo *hashinfo;
 	int i, num, s_i, s_num;
-	struct nlattr *bc;
 	struct sock *sk;
 
 	hashinfo = net->ipv4.tcp_death_row.hashinfo;
-	bc = cb_data->inet_diag_nla_bc;
 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
 	s_i = cb->args[1];
@@ -365,7 +363,7 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 				    r->id.idiag_sport)
 					goto next_listen;
 
-				if (!inet_diag_bc_sk(bc, sk))
+				if (!inet_diag_bc_sk(cb_data, sk))
 					goto next_listen;
 
 				if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
@@ -432,7 +430,7 @@ resume_bind_walk:
 			    r->sdiag_family != sk->sk_family)
 				goto next_bind;
 
-			if (!inet_diag_bc_sk(bc, sk))
+			if (!inet_diag_bc_sk(cb_data, sk))
 				goto next_bind;
 
 			sock_hold(sk);
@@ -519,7 +517,7 @@ next_chunk:
 				goto next_normal;
 			twsk_build_assert();
 
-			if (!inet_diag_bc_sk(bc, sk))
+			if (!inet_diag_bc_sk(cb_data, sk))
 				goto next_normal;
 
 			if (!refcount_inc_not_zero(&sk->sk_refcnt))
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 38cb3a28e4ed..6e491c720c90 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -16,9 +16,9 @@
 static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 			struct netlink_callback *cb,
 			const struct inet_diag_req_v2 *req,
-			struct nlattr *bc, bool net_admin)
+			bool net_admin)
 {
-	if (!inet_diag_bc_sk(bc, sk))
+	if (!inet_diag_bc_sk(cb->data, sk))
 		return 0;
 
 	return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
@@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
 {
 	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	struct net *net = sock_net(skb->sk);
-	struct inet_diag_dump_data *cb_data;
 	int num, s_num, slot, s_slot;
-	struct nlattr *bc;
 
-	cb_data = cb->data;
-	bc = cb_data->inet_diag_nla_bc;
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
 
@@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
 			    r->id.idiag_dport)
 				goto next;
 
-			if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+			if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) {
 				spin_unlock_bh(&hslot->lock);
 				goto done;
 			}
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 0566dd793810..ac974299de71 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -15,9 +15,9 @@
 static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 			struct netlink_callback *cb,
 			const struct inet_diag_req_v2 *req,
-			struct nlattr *bc, bool net_admin)
+			bool net_admin)
 {
-	if (!inet_diag_bc_sk(bc, sk))
+	if (!inet_diag_bc_sk(cb->data, sk))
 		return 0;
 
 	return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
@@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
 				      const struct inet_diag_req_v2 *r,
 				      bool net_admin)
 {
-	struct inet_diag_dump_data *cb_data = cb->data;
 	struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
-	struct nlattr *bc = cb_data->inet_diag_nla_bc;
 	struct net *net = sock_net(skb->sk);
 	struct inet_hashinfo *hinfo;
 	int i;
@@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
 			if (!refcount_inc_not_zero(&sk->sk_refcnt))
 				goto next_listen;
 
-			ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+			ret = sk_diag_dump(sk, skb, cb, r, net_admin);
 
 			sock_put(sk);
 
@@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
 	struct net *net = sock_net(skb->sk);
-	struct inet_diag_dump_data *cb_data;
 	struct mptcp_sock *msk;
-	struct nlattr *bc;
 
 	BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
 
-	cb_data = cb->data;
-	bc = cb_data->inet_diag_nla_bc;
-
 	while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
 					    &diag_ctx->s_num)) != NULL) {
 		struct inet_sock *inet = (struct inet_sock *)msk;
@@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 		    r->id.idiag_dport)
 			goto next;
 
-		ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+		ret = sk_diag_dump(sk, skb, cb, r, net_admin);
 next:
 		sock_put(sk);
 		if (ret < 0) {
--
cgit v1.2.3

From 95fa78830e5b2eb2041174c7f9549c746e003dd6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 10:27:38 +0000
Subject: inet_diag: avoid cache line misses in inet_diag_bc_sk()

inet_diag_bc_sk() pulls five cache lines per socket, while most filters
only need the two first ones.

Add three booleans to struct inet_diag_dump_data, that are selectively
set if a filter needs specific socket fields.

- mark_needed      /* INET_DIAG_BC_MARK_COND present. */
- cgroup_needed    /* INET_DIAG_BC_CGROUP_COND present. */
- userlocks_needed /* INET_DIAG_BC_AUTO present. */

This removes millions of cache lines misses per ss invocation when
simple filters are specified on busy servers.

offsetof(struct sock, sk_userlocks) = 0xf3
offsetof(struct sock, sk_mark) = 0x20c
offsetof(struct sock, sk_cgrp_data) = 0x298

Signed-off-by: Eric Dumazet
Reviewed-by: Kuniyuki Iwashima
Link: https://patch.msgid.link/20250828102738.2065992-6-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/linux/inet_diag.h |  5 +++++
 net/ipv4/inet_diag.c      | 52 ++++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h
index 86a0641ec36e..704fd415c2b4 100644
--- a/include/linux/inet_diag.h
+++ b/include/linux/inet_diag.h
@@ -38,6 +38,11 @@ struct inet_diag_dump_data {
 #define inet_diag_nla_bpf_stgs	req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES]
 
 	struct bpf_sk_storage_diag *bpf_stg_diag;
+	bool mark_needed;	/* INET_DIAG_BC_MARK_COND present. */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+	bool cgroup_needed;	/* INET_DIAG_BC_CGROUP_COND present. */
+#endif
+	bool userlocks_needed;	/* INET_DIAG_BC_AUTO present. */
 };
 
 struct inet_connection_sock;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 117103042687..f0b6c5a411a2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -605,18 +605,22 @@ int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
 	entry.sport = READ_ONCE(inet->inet_num);
 	entry.dport = ntohs(READ_ONCE(inet->inet_dport));
 	entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
-	entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
-	if (sk_fullsock(sk))
-		entry.mark = READ_ONCE(sk->sk_mark);
-	else if (sk->sk_state == TCP_NEW_SYN_RECV)
-		entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
-	else if (sk->sk_state == TCP_TIME_WAIT)
-		entry.mark = inet_twsk(sk)->tw_mark;
-	else
-		entry.mark = 0;
+	if (cb_data->userlocks_needed)
+		entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
+	if (cb_data->mark_needed) {
+		if (sk_fullsock(sk))
+			entry.mark = READ_ONCE(sk->sk_mark);
+		else if (sk->sk_state == TCP_NEW_SYN_RECV)
+			entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+		else if (sk->sk_state == TCP_TIME_WAIT)
+			entry.mark = inet_twsk(sk)->tw_mark;
+		else
+			entry.mark = 0;
+	}
 #ifdef CONFIG_SOCK_CGROUP_DATA
-	entry.cgroup_id = sk_fullsock(sk) ?
-		cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+	if (cb_data->cgroup_needed)
+		entry.cgroup_id = sk_fullsock(sk) ?
+			cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
 #endif
 
 	return inet_diag_bc_run(bc, &entry);
@@ -716,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
 }
 #endif
 
-static int inet_diag_bc_audit(const struct nlattr *attr,
+static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data,
 			      const struct sk_buff *skb)
 {
-	bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+	const struct nlattr *attr = cb_data->inet_diag_nla_bc;
 	const void *bytecode, *bc;
 	int bytecode_len, len;
+	bool net_admin;
+
+	if (!attr)
+		return 0;
 
-	if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+	if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
 		return -EINVAL;
 
+	net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
 	bytecode = bc = nla_data(attr);
 	len = bytecode_len = nla_len(attr);
 
@@ -757,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
 				return -EPERM;
 			if (!valid_markcond(bc, len, &min_len))
 				return -EINVAL;
+			cb_data->mark_needed = true;
 			break;
 #ifdef CONFIG_SOCK_CGROUP_DATA
 		case INET_DIAG_BC_CGROUP_COND:
 			if (!valid_cgroupcond(bc, len, &min_len))
 				return -EINVAL;
+			cb_data->cgroup_needed = true;
 			break;
 #endif
 		case INET_DIAG_BC_AUTO:
+			cb_data->userlocks_needed = true;
+			fallthrough;
 		case INET_DIAG_BC_JMP:
 		case INET_DIAG_BC_NOP:
 			break;
@@ -841,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
 		kfree(cb_data);
 		return err;
 	}
 
-	nla = cb_data->inet_diag_nla_bc;
-	if (nla) {
-		err = inet_diag_bc_audit(nla, skb);
-		if (err) {
-			kfree(cb_data);
-			return err;
-		}
+	err = inet_diag_bc_audit(cb_data, skb);
+	if (err) {
+		kfree(cb_data);
+		return err;
 	}
 
 	nla = cb_data->inet_diag_nla_bpf_stgs;
--
cgit v1.2.3

From caedcc5b6df1b2e2b5f39079e3369c1d4d5c5f50 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 19:58:16 +0000
Subject: net: dst: introduce dst->dev_rcu

Followup of commit 88fe14253e18 ("net: dst: add four helpers to
annotate data-races around dst->dev").

We want to gradually add explicit RCU protection to dst->dev,
including lockdep support.

Add a union to alias dst->dev_rcu and dst->dev.

Add dst_dev_net_rcu() helper.
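Intended use of the pair of helpers, sketched (dst_dev_rcu() existed
already and is rewritten here; dst_dev_net_rcu() is the new addition):

	rcu_read_lock();
	dev = dst_dev_rcu(dst);		/* rcu_dereference(dst->dev_rcu): lockdep-checked */
	net = dst_dev_net_rcu(dst);	/* shorthand for dev_net_rcu(dst_dev_rcu(dst)) */
	/* ... dev and net remain valid for the whole read-side section ... */
	rcu_read_unlock();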
Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()")
Signed-off-by: Eric Dumazet
Reviewed-by: David Ahern
Link: https://patch.msgid.link/20250828195823.3958522-2-edumazet@google.com
Signed-off-by: Jakub Kicinski
---
 include/net/dst.h | 16 +++++++++++-----
 net/core/dst.c    |  2 +-
 net/ipv4/route.c  |  4 ++--
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/net/dst.h b/include/net/dst.h
index bab01363bb97..f8aa1239b4db 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -24,7 +24,10 @@
 struct sk_buff;
 
 struct dst_entry {
-	struct net_device       *dev;
+	union {
+		struct net_device	*dev;
+		struct net_device __rcu	*dev_rcu;
+	};
 	struct  dst_ops	        *ops;
 	unsigned long		_metrics;
 	unsigned long           expires;
@@ -570,9 +573,12 @@ static inline struct net_device *dst_dev(const struct dst_entry *dst)
 
 static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst)
 {
-	/* In the future, use rcu_dereference(dst->dev) */
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	return READ_ONCE(dst->dev);
+	return rcu_dereference(dst->dev_rcu);
+}
+
+static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst)
+{
+	return dev_net_rcu(dst_dev_rcu(dst));
 }
 
 static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
@@ -592,7 +598,7 @@ static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
 
 static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb)
 {
-	return dev_net_rcu(skb_dst_dev(skb));
+	return dev_net_rcu(skb_dst_dev_rcu(skb));
 }
 
 struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
diff --git a/net/core/dst.c b/net/core/dst.c
index e2de8b68c41d..e9d35f49c9e7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst)
 		dst->ops->ifdown(dst, dev);
 	WRITE_ONCE(dst->input, dst_discard);
 	WRITE_ONCE(dst->output, dst_discard_out);
-	WRITE_ONCE(dst->dev, blackhole_netdev);
+	rcu_assign_pointer(dst->dev_rcu, blackhole_netdev);
 	netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
 			   GFP_ATOMIC);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cc86a917a1bb..44382d175589 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1027,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		return;
 
 	rcu_read_lock();
-	net = dev_net_rcu(dst_dev(dst));
+	net = dst_dev_net_rcu(dst);
 	if (mtu < net->ipv4.ip_rt_min_pmtu) {
 		lock = true;
 		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
@@ -1327,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 	struct net *net;
 
 	rcu_read_lock();
-	net = dev_net_rcu(dst_dev(dst));
+	net = dst_dev_net_rcu(dst);
 	advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
 		       net->ipv4.ip_rt_min_advmss);
 	rcu_read_unlock();
--
cgit v1.2.3

From b775ecf1655cedbc465fd6699ab18a2bc4e7a352 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 28 Aug 2025 19:58:17 +0000
Subject: ipv6: start using dst_dev_rcu()

Refactor icmpv6_xrlim_allow() and ip6_dst_hoplimit() so that we hold
rcu_read_lock() a bit longer, to be able to use dst_dev_rcu() instead
of dst_dev().

__ip6_rt_update_pmtu() and rt6_do_redirect() can directly use
dst_dev_rcu() in sections already holding rcu_read_lock().

Small changes to use dst_dev_net_rcu() in ip6_default_advmss(),
ipv6_sock_ac_join(), ip6_mc_find_dev() and ndisc_send_skb().
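All of these conversions share one shape; a generic before/after sketch
rather than any single hunk below:

	/* before: device fetched outside the RCU section */
	dev = dst_dev(dst);
	...
	rcu_read_lock();
	...
	rcu_read_unlock();

	/* after: the fetch and every use sit inside one section */
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	...
	rcu_read_unlock();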
Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/anycast.c | 2 +- net/ipv6/icmp.c | 6 +++--- net/ipv6/mcast.c | 2 +- net/ipv6/ndisc.c | 2 +- net/ipv6/output_core.c | 8 +++++--- net/ipv6/route.c | 7 +++---- 6 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f8a8e46286b8..52599584422b 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -104,7 +104,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { - dev = dst_dev(&rt->dst); + dev = dst_dev_rcu(&rt->dst); netdev_hold(dev, &dev_tracker, GFP_ATOMIC); ip6_rt_put(rt); } else if (ishost) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 44550957fd4e..95cdd4cacb00 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -209,7 +209,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); - dev = dst_dev(dst); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -224,11 +225,10 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - rcu_read_unlock(); } + rcu_read_unlock(); if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 55c49dc14b1b..016b572e7d6f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -180,7 +180,7 @@ static struct net_device *ip6_mc_find_dev(struct net *net, rcu_read_lock(); rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { - dev = dst_dev(&rt->dst); + dev = dst_dev_rcu(&rt->dst); dev_hold(dev); ip6_rt_put(rt); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 57aaa7ae8ac3..f427e41e9c49 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -505,7 +505,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - dev = dst_dev(dst); + dev = dst_dev_rcu(dst); idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index d21fe27fe21e..1c9b283a4132 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -104,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt); int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + + rcu_read_lock(); if (hoplimit == 0) { - struct net_device *dev = dst_dev(dst); + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev; - rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) hoplimit = READ_ONCE(idev->cnf.hop_limit); else hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); - rcu_read_unlock(); } + rcu_read_unlock(); + return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3299cfa12e21..3371f16b7a3e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if 
(res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst_dev(dst), + .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; @@ -3238,7 +3238,6 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - struct net_device *dev = dst_dev(dst); unsigned int mtu = dst_mtu(dst); struct net *net; @@ -3246,7 +3245,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) rcu_read_lock(); - net = dev_net_rcu(dev); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; @@ -4301,7 +4300,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst_dev(dst), + .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; -- cgit v1.2.3 From 9085e56501d93af9f2d7bd16f7fcfacdde47b99c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:18 +0000 Subject: ipv6: use RCU in ip6_xmit() Use RCU in ip6_xmit() in order to use dst_dev_rcu() to prevent possible UAF. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e1410237b6e..e234640433d6 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -268,35 +268,36 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { - struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); + struct net *net = sock_net(sk); unsigned int head_room; + struct net_device *dev; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; - int hlimit = -1; + int ret, hlimit = -1; u32 mtu; + rcu_read_lock(); + + dev = dst_dev_rcu(dst); head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive while we hold rcu_read_lock(). */ skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); - return -ENOBUFS; + ret = -ENOBUFS; + goto unlock; } - rcu_read_unlock(); } if (opt) { @@ -358,17 +359,21 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); - if (unlikely(!skb)) - return 0; + if (unlikely(!skb)) { + ret = 0; + goto unlock; + } /* hooks should never assume socket lock is held. 
* we promote our socket to non const */ - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dev, - dst_output); + ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dev, + dst_output); + goto unlock; } + ret = -EMSGSIZE; skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const @@ -377,7 +382,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); - return -EMSGSIZE; +unlock: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_xmit); -- cgit v1.2.3 From 11709573cc4e48dc34c80fc7ab9ce5b159e29695 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:19 +0000 Subject: ipv6: use RCU in ip6_output() Use RCU in ip6_output() in order to use dst_dev_rcu() to prevent possible UAF. We can remove rcu_read_lock()/rcu_read_unlock() pairs from ip6_finish_output2(). Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e234640433d6..9d64c13bab5e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst); + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -70,15 +70,12 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive because we hold rcu_read_lock(). 
*/ skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); return -ENOMEM; } - rcu_read_unlock(); } hdr = ipv6_hdr(skb); @@ -123,7 +120,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); @@ -131,7 +127,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { - rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; @@ -139,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); - rcu_read_unlock(); return ret; } @@ -233,22 +227,29 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst), *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(dst); + struct net_device *dev, *indev = skb->dev; + struct inet6_dev *idev; + int ret; skb->protocol = htons(ETH_P_IPV6); + rcu_read_lock(); + dev = dst_dev_rcu(dst); + idev = ip6_dst_idev(dst); skb->dev = dev; if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip6_finish_output, - !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_output); -- cgit v1.2.3 From 99a2ace61b211b0be861b07fbaa062fca4b58879 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:20 +0000 Subject: net: use dst_dev_rcu() in sk_setup_caps() Use RCU to protect accesses to dst->dev from sk_setup_caps() and sk_dst_gso_max_size(). Also use dst_dev_rcu() in ip6_dst_mtu_maybe_forward(), and ip_dst_mtu_maybe_forward(). ip4_dst_hoplimit() can use dst_dev_net_rcu(). 
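The resulting shape of sk_setup_caps(), condensed (kernel-context C; the TCP-specific branch and the GSO segment clamping are elided, see the net/core/sock.c hunk below):

        static void sk_setup_caps_sketch(struct sock *sk, struct dst_entry *dst)
        {
                const struct net_device *dev;

                rcu_read_lock();
                dev = dst_dev_rcu(dst);
                sk->sk_route_caps = dev->features;
                /* helpers now take the already-resolved device instead of
                 * re-deriving it from dst on every use */
                sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
                sk_dst_set(sk, dst);
                rcu_read_unlock();
        }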
Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 6 ++++-- include/net/ip6_route.h | 2 +- include/net/route.h | 2 +- net/core/sock.c | 16 ++++++++++------ 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index befcba575129..6dbd2bf8fa9c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -467,12 +467,14 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); + const struct net_device *dev; unsigned int mtu, res; struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + dev = dst_dev_rcu(dst); + net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -486,7 +488,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst_dev(dst)->mtu); + mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9255f21818ee..59f48ca3abdf 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst_dev(dst)); + idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/include/net/route.h b/include/net/route.h index c71998f464f8..f90106f383c5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -390,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/net/core/sock.c b/net/core/sock.c index e66ad1ec3a2d..9a8290fcc35d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2587,7 +2587,7 @@ free: } EXPORT_SYMBOL_GPL(sk_clone_lock); -static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; @@ -2597,8 +2597,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : - READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); + max_size = is_ipv6 ? 
READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2607,9 +2607,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk->sk_route_caps = dst_dev(dst)->features; + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2625,13 +2628,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); -- cgit v1.2.3 From 50c127a69cd6285300931853b352a1918cfa180f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:21 +0000 Subject: tcp_metrics: use dst_dev_net_rcu() Replace three dst_dev() with a lockdep enabled helper. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_metrics.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03c068ea27b6..10e86f1008e9 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct net *net; spin_lock_bh(&tcp_metrics_lock); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); -- cgit v1.2.3 From b62a59c18b692f892dcb8109c1c2e653b2abc95c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:22 +0000 Subject: tcp: use dst_dev_rcu() in tcp_fastopen_active_disable_ofo_check() Use RCU to avoid a pair of atomic operations and a potential UAF on dst_dev()->flags. 
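The cost difference, in isolation (kernel-context C, mirroring the hunk below):

        rcu_read_lock();
        dst = __sk_dst_get(sk);                 /* no refcount taken ... */
        dev = dst ? dst_dev_rcu(dst) : NULL;
        if (!(dev && (dev->flags & IFF_LOOPBACK)))
                atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
        rcu_read_unlock();                      /* ... and none to drop */

The old sk_dst_get()/dst_release() pair cost two atomic operations per call, and pinning the dst never pinned its device, so dst_dev()->flags could still be read from a device being torn down.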
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_fastopen.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f1884f0c9e52..7d945a527daf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -576,11 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { - dst = sk_dst_get(sk); - dev = dst ? dst_dev(dst) : NULL; + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); - dst_release(dst); + rcu_read_unlock(); } } -- cgit v1.2.3 From 6ad8de3cefdb6ffa6708b21c567df0dbf82c43a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 19:58:23 +0000 Subject: ipv4: start using dst_dev_rcu() Change icmpv4_xrlim_allow(), ip_defrag() to prevent possible UAF. Change ipmr_prepare_xmit(), ipmr_queue_fwd_xmit(), ip_mr_output(), ipv4_neigh_lookup() to use lockdep enabled dst_dev_rcu(). Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250828195823.3958522-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 6 +++--- net/ipv4/ip_fragment.c | 6 ++++-- net/ipv4/ipmr.c | 6 +++--- net/ipv4/route.c | 4 ++-- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7248c15cbd75..823c70e34de8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -319,17 +319,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, return true; /* No rate limit on loopback */ - dev = dst_dev(dst); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dev && (dev->flags & IFF_LOOPBACK)) goto out; - rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - rcu_read_unlock(); out: + rcu_read_unlock(); if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b2584cce90ae..f7012479713b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -476,14 +476,16 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { - struct net_device *dev = skb->dev ? : skb_dst_dev(skb); - int vif = l3mdev_master_ifindex_rcu(dev); + struct net_device *dev; struct ipq *qp; + int vif; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ rcu_read_lock(); + dev = skb->dev ? 
: skb_dst_dev_rcu(skb); + vif = l3mdev_master_ifindex_rcu(dev); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret, refs = 0; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 345e5faac634..ca9eaee4c2ef 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1905,7 +1905,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, return -1; } - encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1958,7 +1958,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt, * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, - net, NULL, skb, skb->dev, rt->dst.dev, + net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst), ipmr_forward_finish); return; @@ -2302,7 +2302,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) guard(rcu)(); - dev = rt->dst.dev; + dev = dst_dev_rcu(&rt->dst); if (IPCB(skb)->flags & IPSKB_FORWARDED) goto mc_output; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 44382d175589..50309f2ab132 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -414,11 +414,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst_dev(dst); + struct net_device *dev; struct neighbour *n; rcu_read_lock(); - + dev = dst_dev_rcu(dst); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { -- cgit v1.2.3 From a59076f2669ec23a122549e1f4114e8d4255b632 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:57 -0700 Subject: lsm: security_lsmblob_to_secctx module selection Add a parameter lsmid to security_lsmblob_to_secctx() to identify which of the security modules that may be active should provide the security context. If the value of lsmid is LSM_ID_UNDEF the first LSM providing a hook is used. security_secid_to_secctx() is unchanged, and will always report the first LSM providing a hook. 
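A hypothetical caller, to make the selection semantics concrete (kernel-context C; assumes the LSM_ID_* constants from the uapi lsm.h):

        struct lsm_prop prop;   /* previously obtained, e.g. from an audit event */
        struct lsm_context ctx;
        int len;

        /* ask one specific module for the context ... */
        len = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_SELINUX);
        if (len < 0)
                /* ... or fall back to whichever active LSM answers first */
                len = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF);
        if (len >= 0)
                security_release_secctx(&ctx);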
Signed-off-by: Casey Schaufler [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/security.h | 6 ++++-- kernel/audit.c | 4 ++-- kernel/auditsc.c | 8 +++++--- net/netlabel/netlabel_user.c | 3 ++- security/security.c | 18 ++++++++++++++++-- 5 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..6d1ed6e7387b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - struct lsm_context *cp) + struct lsm_context *cp, + int lsmid) { return -EOPNOTSUPP; } diff --git a/kernel/audit.c b/kernel/audit.c index 547967cb4266..226c8ae00d04 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1473,7 +1473,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -2188,7 +2188,7 @@ int audit_log_task_context(struct audit_buffer *ab) if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); + error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); if (error < 0) { if (error != -EINVAL) goto error_path; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8ec768e2c1e5..3b606fd4ae8e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1109,7 +1109,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { + if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { @@ -1395,7 +1395,8 @@ static void show_special(struct audit_context *context, int *call_panic) struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + &lsmctx, + LSM_ID_UNDEF) < 0) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", lsmctx.context); @@ -1560,7 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, if (lsmprop_is_set(&n->oprop)) { struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx, + LSM_ID_UNDEF) < 0) { if (call_panic) *call_panic = 2; } else { diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 0d04d23aafe7..6d6545297ee3 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -98,7 +98,8 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - 
security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { + security_lsmprop_to_secctx(&audit_info->prop, &ctx, + LSM_ID_UNDEF) > 0) { audit_log_format(audit_buf, " subj=%s", ctx.context); security_release_secctx(&ctx); } diff --git a/security/security.c b/security/security.c index ad163f06bf7a..dd588f548a2b 100644 --- a/security/security.c +++ b/security/security.c @@ -4342,17 +4342,31 @@ EXPORT_SYMBOL(security_secid_to_secctx); * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx * @prop: lsm specific information * @cp: the LSM context + * @lsmid: which security module to report * * Convert a @prop entry to security context. If @cp is NULL the * length of the result will be returned. This does mean that the * length could change between calls to check the length and the * next call which actually allocates and returns the @cp. * + * @lsmid identifies which LSM should supply the context. + * A value of LSM_ID_UNDEF indicates that the first LSM suppling + * the hook should be used. This is used in cases where the + * ID of the supplying LSM is unambiguous. + * * Return: Return length of data on success, error on failure. */ -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid) { - return call_int_hook(lsmprop_to_secctx, prop, cp); + struct lsm_static_call *scall; + + lsm_for_each_hook(scall, lsmprop_to_secctx) { + if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id) + continue; + return scall->hl->hook.lsmprop_to_secctx(prop, cp); + } + return LSM_RET_DEFAULT(lsmprop_to_secctx); } EXPORT_SYMBOL(security_lsmprop_to_secctx); -- cgit v1.2.3 From eb59d494eebd4c5414728a35cdea6a0ba78ff26e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:58 -0700 Subject: audit: add record for multiple task security contexts Replace the single skb pointer in an audit_buffer with a list of skb pointers. Add the audit_stamp information to the audit_buffer as there's no guarantee that there will be an audit_context containing the stamp associated with the event. At audit_log_end() time create auxiliary records as have been added to the list. Functions are created to manage the skb list in the audit_buffer. Create a new audit record AUDIT_MAC_TASK_CONTEXTS. An example of the MAC_TASK_CONTEXTS record is: type=MAC_TASK_CONTEXTS msg=audit(1600880931.832:113) subj_apparmor=unconfined subj_smack=_ When an audit event includes a AUDIT_MAC_TASK_CONTEXTS record the "subj=" field in other records in the event will be "subj=?". An AUDIT_MAC_TASK_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on a subject security context. Refactor audit_log_task_context(), creating a new audit_log_subj_ctx(). This is used in netlabel auditing to provide multiple subject security contexts as necessary. 
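On the caller side the whole open-coded subj= sequence collapses to one call; a sketch of the new usage, based on the netlabel hunk below (kernel-context C):

        /* before: lsmprop_is_set() + security_lsmprop_to_secctx() +
         * audit_log_format() + security_release_secctx(), open-coded */
        audit_log_subj_ctx(audit_buf, &audit_info->prop);

With a single active LSM this emits " subj=..." directly; with several, the main record carries " subj=?" and the per-LSM fields land in the auxiliary record, as in the MAC_TASK_CONTEXTS example above.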
Suggested-by: Paul Moore Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 16 ++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 208 ++++++++++++++++++++++++++++++++++++------- net/netlabel/netlabel_user.c | 9 +- security/apparmor/lsm.c | 3 + security/selinux/hooks.c | 3 + security/smack/smack_lsm.c | 3 + 7 files changed, 202 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/linux/audit.h b/include/linux/audit.h index e3f06eba9c6e..a1f068bcb3a0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,9 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +190,7 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +216,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +253,11 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +282,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5..8cad2f307719 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,7 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04..c924b30f2524 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,11 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +201,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. 
Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +286,27 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. + */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1776,10 +1805,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1795,6 +1827,10 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_head_init(&ab->skb_list); + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1860,7 +1896,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1915,14 +1950,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &stamp); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)stamp.ctime.tv_sec, - stamp.ctime.tv_nsec/1000000, - stamp.serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2178,31 +2213,128 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. 
+ */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions. 
+ */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, @@ -2411,6 +2543,26 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else + audit_log_lost("rate limit exceeded"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2423,25 +2575,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 6d6545297ee3..0da652844dd6 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,13 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx, - LSM_ID_UNDEF) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..220d1684b8d4 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ 
apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..975b84b466b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7618,6 +7618,9 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..eaff9b8901a7 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,9 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + return 0; } -- cgit v1.2.3 From d77b6ff0ce35a6d0b0b7b9581bc3f76d041d4087 Mon Sep 17 00:00:00 2001 From: Stanislav Fort Date: Sun, 31 Aug 2025 16:56:23 +0200 Subject: batman-adv: fix OOB read/write in network-coding decode batadv_nc_skb_decode_packet() trusts coded_len and checks only against skb->len. XOR starts at sizeof(struct batadv_unicast_packet), reducing payload headroom, and the source skb length is not verified, allowing an out-of-bounds read and a small out-of-bounds write. Validate that coded_len fits within the payload area of both destination and source sk_buffs before XORing. Fixes: 2df5278b0267 ("batman-adv: network coding - receive coded packets and decode them") Cc: stable@vger.kernel.org Reported-by: Stanislav Fort Signed-off-by: Stanislav Fort Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/network-coding.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 9f56308779cc..af97d077369f 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1687,7 +1687,12 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, coding_len = ntohs(coded_packet_tmp.coded_len); - if (coding_len > skb->len) + /* ensure dst buffer is large enough (payload only) */ + if (coding_len + h_size > skb->len) + return NULL; + + /* ensure src buffer is large enough (payload only) */ + if (coding_len + h_size > nc_packet->skb->len) return NULL; /* Here the magic is reversed: -- cgit v1.2.3 From cd8ae32e4e4652db55bce6b9c79267d8946765a9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 29 Aug 2025 10:54:15 +0200 Subject: xfrm: xfrm_alloc_spi shouldn't use 0 as SPI x->id.spi == 0 means "no SPI assigned", but since commit 94f39804d891 ("xfrm: Duplicate SPI Handling"), we now create states and add them to the byspi list with this value. __xfrm_state_delete doesn't remove those states from the byspi list, since they shouldn't be there, and this shows up as a UAF the next time we go through the byspi list. 
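Condensed shape of the fixed allocation loop (kernel-context C; locking and the success path are elided, see the hunk below). Note the goto rather than a continue, so the signal check still runs on skipped iterations:

        for (h = 0; h < range; h++) {
                u32 spi = (low == high) ? low :
                          get_random_u32_inclusive(low, high);

                if (spi == 0)
                        goto next;      /* 0 means "no SPI assigned" */

                /* ... try to claim htonl(spi) under xfrm_state_lock ... */
        next:
                if (signal_pending(current))
                        return -ERESTARTSYS;    /* simplified error path */
        }

Installing a zero SPI put the state on the byspi hash, but __xfrm_state_delete() only unlinks states whose x->id.spi is non-zero, hence the use-after-free on the next walk.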
Reported-by: syzbot+a25ee9d20d31e483ba7b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a25ee9d20d31e483ba7b Fixes: 94f39804d891 ("xfrm: Duplicate SPI Handling") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78fcbb89cf32..d213ca3653a8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2583,6 +2583,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, for (h = 0; h < range; h++) { u32 spi = (low == high) ? low : get_random_u32_inclusive(low, high); + if (spi == 0) + goto next; newspi = htonl(spi); spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -2598,6 +2600,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, xfrm_state_put(x0); spin_unlock_bh(&net->xfrm.xfrm_state_lock); +next: if (signal_pending(current)) { err = -ERESTARTSYS; goto unlock; -- cgit v1.2.3 From edd3cb05c00a040dc72bed20b14b5ba865188bce Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:51 +0200 Subject: copy_process: pass clone_flags as u64 across calltree With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit, with a new type of u64 for the flags. However, for most consumers of clone_flags the interface was not changed from the previous type of unsigned long. While this works fine as long as none of the new 64-bit flag bits (CLONE_CLEAR_SIGHAND and CLONE_INTO_CGROUP) are evaluated, this is still undesirable in terms of the principle of least surprise. Thus, this commit fixes all relevant interfaces of callees to sys_clone3/copy_process (excluding the architecture-specific copy_thread) to consistently pass clone_flags as u64, so that no truncation to 32-bit integers occurs on 32-bit architectures. 
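A standalone illustration of the truncation being fixed (plain C; the flag value matches include/uapi/linux/sched.h). Built natively it prints "1 1"; built with -m32, the unsigned long path prints 0 because bit 33 is silently dropped:

        #include <stdint.h>
        #include <stdio.h>

        #define CLONE_INTO_CGROUP 0x200000000ULL  /* a post-clone3, >32-bit flag */

        static int sees_flag_ulong(unsigned long flags)  /* old interfaces */
        {
                return !!(flags & CLONE_INTO_CGROUP);
        }

        static int sees_flag_u64(uint64_t flags)         /* converted interfaces */
        {
                return !!(flags & CLONE_INTO_CGROUP);
        }

        int main(void)
        {
                uint64_t clone_flags = CLONE_INTO_CGROUP;

                /* on a 32-bit target the first argument is truncated at
                 * the call boundary, before the bit test ever runs */
                printf("%d %d\n", sees_flag_ulong(clone_flags),
                       sees_flag_u64(clone_flags));
                return 0;
        }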
Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-2-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- block/blk-ioc.c | 2 +- fs/namespace.c | 2 +- include/linux/cgroup.h | 4 ++-- include/linux/cred.h | 2 +- include/linux/iocontext.h | 6 +++--- include/linux/ipc_namespace.h | 4 ++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/mnt_namespace.h | 2 +- include/linux/nsproxy.h | 2 +- include/linux/pid_namespace.h | 4 ++-- include/linux/rseq.h | 4 ++-- include/linux/sched/task.h | 2 +- include/linux/security.h | 4 ++-- include/linux/sem.h | 4 ++-- include/linux/time_namespace.h | 4 ++-- include/linux/uprobes.h | 4 ++-- include/linux/user_events.h | 4 ++-- include/linux/utsname.h | 4 ++-- include/net/net_namespace.h | 4 ++-- include/trace/events/task.h | 6 +++--- ipc/namespace.c | 2 +- ipc/sem.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/cred.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/fork.c | 8 ++++---- kernel/nsproxy.c | 4 ++-- kernel/pid_namespace.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- kernel/time/namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- security/apparmor/lsm.c | 2 +- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/tomoyo/tomoyo.c | 2 +- 38 files changed, 59 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5..d15918d7fabb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ out: } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..d9c190ffa7df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4200,7 +4200,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..56d9556a181a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -796,7 +796,7 @@ extern struct cgroup_namespace init_cgroup_ns; void free_cgroup_ns(struct cgroup_namespace *ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); @@ -818,7 +818,7 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833f..89ae50ad2ace 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct task_struct *, 
u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b443..079d8773790c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..4b399893e2b3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,7 +129,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) @@ -151,7 +151,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..adbe234a6f6c 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..ff290c87b2e7 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,7 @@ struct fs_struct; struct user_namespace; struct ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..82533e899ff4 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -103,7 
+103,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..0620a3e08e83 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -78,7 +78,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); @@ -97,7 +97,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..a96fd345aa38 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -65,7 +65,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +107,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..34d6a0e108c3 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..9a1d4a6c8673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -1215,7 
+1215,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb..275269ce2ec8 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..b6e36525e0be 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -43,7 +43,7 @@ static inline struct time_namespace *get_time_ns(struct time_namespace *ns) return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -129,7 +129,7 @@ static inline void put_time_ns(struct time_namespace *ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c39094..915303a82d84 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -205,7 +205,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -281,7 +281,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a0973..57d1ff006090 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..ba34ec0e2f95 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -35,7 +35,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) refcount_inc(&ns->ns.count); } -extern struct 
uts_namespace *copy_utsname(unsigned long flags, +extern struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct uts_namespace *ns); @@ -55,7 +55,7 @@ static inline void put_uts_ns(struct uts_namespace *ns) { } -static inline struct uts_namespace *copy_utsname(unsigned long flags, +static inline struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..0e008cfe159d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -204,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include #include -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) diff --git a/include/trace/events/task.h b/include/trace/events/task.h index af535b053033..4f0759634306 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -8,14 +8,14 @@ TRACE_EVENT(task_newtask, - TP_PROTO(struct task_struct *task, unsigned long clone_flags), + TP_PROTO(struct task_struct *task, u64 clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) - __field( unsigned long, clone_flags) + __field( u64, clone_flags) __field( short, oom_score_adj) ), @@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", + TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe..a712ec27209c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -106,7 +106,7 @@ fail: return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) diff --git a/ipc/sem.c b/ipc/sem.c index a39cdc7bf88f..0f06e4bd4673 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. 
*/ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..dedadb525880 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -47,7 +47,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981..dbf6b687dc5c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..b2753014c6dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2160,7 +2160,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. */ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; diff --git a/kernel/fork.c b/kernel/fork.c index 4e2c5a3e8989..d6e1fb11eff9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1507,7 +1507,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1545,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1566,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1645,7 +1645,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8af3b9ec3aa8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
*/ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..06bc7c7f78e0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -171,7 +171,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..6fa85d30d965 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4472,7 +4472,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4707,7 +4707,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..af0866ce2dfc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..f9adfc912ddc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1935,12 +1935,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..888872bcc5bb 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -130,7 +130,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..00d8d7922f86 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -86,7 +86,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. 
*/ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..8ec9d83475bf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -539,7 +539,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..ba39cfe0cd08 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task) } static int apparmor_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); diff --git a/security/security.c b/security/security.c index ad163f06bf7a..a769140553bc 100644 --- a/security/security.c +++ b/security/security.c @@ -3185,7 +3185,7 @@ int security_file_truncate(struct file *file) * * Return: Returns a zero on success, negative values on failure. */ -int security_task_alloc(struct task_struct *task, unsigned long clone_flags) +int security_task_alloc(struct task_struct *task, u64 clone_flags) { int rc = lsm_task_alloc(task); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..bb016dd511c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4144,7 +4144,7 @@ static int selinux_file_open(struct file *file) /* task security operations */ static int selinux_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { u32 sid = current_sid(); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index d6ebcd9db80a..48fc59d38ab2 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); -- cgit v1.2.3 From 7051b54fb5aa2d0b77657aef7c272471b36c0327 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 29 Aug 2025 21:56:38 +0000 Subject: tcp: Remove sk->sk_prot->orphan_count. TCP tracks the number of orphaned (SOCK_DEAD but not yet destructed) sockets in tcp_orphan_count. In some code that was shared with DCCP, tcp_orphan_count is referenced via sk->sk_prot->orphan_count. Let's reference tcp_orphan_count directly. inet_csk_prepare_for_destroy_sock() is moved to inet_connection_sock.c due to header dependency. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250829215641.711664-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c | 4 ++-- drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h | 1 - include/net/inet_connection_sock.h | 8 +------- include/net/sock.h | 2 -- include/net/tcp.h | 10 ++++++++++ net/ipv4/inet_connection_sock.c | 11 +++++++++-- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 10 files changed, 24 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c index 000116e47e38..4ee970f3bad6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c @@ -505,7 +505,7 @@ static void reset_listen_child(struct sock *child) chtls_send_reset(child, CPL_ABORT_SEND_RST, skb); sock_orphan(child); - INC_ORPHAN_COUNT(child); + tcp_orphan_count_inc(); if (child->sk_state == TCP_CLOSE) inet_csk_destroy_sock(child); } @@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent) * created only after 3 way handshake is done. */ sock_orphan(child); - INC_ORPHAN_COUNT(child); + tcp_orphan_count_inc(); chtls_release_resources(child); chtls_conn_done(child); } else { diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h index 667effc2a23c..29ceff5a5fcb 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h @@ -95,7 +95,6 @@ struct deferred_skb_cb { #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok) #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok) #define SACK_OK(tp) ((tp)->rx_opt.sack_ok) -#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count) /* TLS SKB */ #define skb_ulp_tls_inline(skb) (ULP_SKB_CB(skb)->ulp.tls.ofld) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 1735db332aab..0737d8e178dd 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -299,14 +299,8 @@ reqsk_timeout(struct request_sock *req, unsigned long max_timeout) return (unsigned long)min_t(u64, timeout, max_timeout); } -static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) -{ - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - this_cpu_inc(*sk->sk_prot->orphan_count); -} - void inet_csk_destroy_sock(struct sock *sk); +void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* diff --git a/include/net/sock.h b/include/net/sock.h index 73cd3316e288..1e7f124871d2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1353,8 +1353,6 @@ struct proto { unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - unsigned int __percpu *orphan_count; - struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; diff --git a/include/net/tcp.h b/include/net/tcp.h index 16dc9cebb9d2..0fb7923b8367 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -54,6 +54,16 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int 
tcp_orphan_count_sum(void); +static inline void tcp_orphan_count_inc(void) +{ + this_cpu_inc(tcp_orphan_count); +} + +static inline void tcp_orphan_count_dec(void) +{ + this_cpu_dec(tcp_orphan_count); +} + DECLARE_PER_CPU(u32, tcp_tw_isn); void tcp_time_wait(struct sock *sk, int state, int timeo); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0ef1eacd539d..142ff8d86fc2 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1296,12 +1296,19 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); + tcp_orphan_count_dec(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); +void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + tcp_orphan_count_inc(); +} + /* This function allows to force a closure of a socket after the call to * tcp_create_openreq_child(). */ @@ -1369,7 +1376,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4bc2b1921d2b..ef4ccfd46ff6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -707,7 +707,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bc8317e92b7..40b774b4f587 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3195,7 +3195,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. */ __release_sock(sk); - this_cpu_inc(tcp_orphan_count); + tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? 
*/ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7c1d612afca1..1e58a8a9ff7a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3517,7 +3517,6 @@ struct proto tcp_prot = { .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .orphan_count = &tcp_orphan_count, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b4e56b877273..07ba32156770 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2353,7 +2353,6 @@ struct proto tcpv6_prot = { .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, - .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), -- cgit v1.2.3 From c6dd1aa2cbb72b33e0569f3e71d95792beab5042 Mon Sep 17 00:00:00 2001 From: Fabian Bläse Date: Thu, 28 Aug 2025 11:14:35 +0200 Subject: icmp: fix icmp_ndo_send address translation for reply direction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The icmp_ndo_send function was originally introduced to ensure proper rate limiting when icmp_send is called by a network device driver, where the packet's source address may have already been transformed by SNAT. However, the original implementation only considers the IP_CT_DIR_ORIGINAL direction for SNAT and always replaced the packet's source address with that of the original-direction tuple. This causes two problems: 1. For SNAT: Reply-direction packets were incorrectly translated using the source address of the CT original direction, even though no translation is required. 2. For DNAT: Reply-direction packets were not handled at all. In DNAT, the original direction's destination is translated. Therefore, in the reply direction the source address must be set to the reply-direction source, so rate limiting works as intended. Fix this by using the connection direction to select the correct tuple for source address translation, and adjust the pre-checks to handle reply-direction packets in case of DNAT. Additionally, wrap the `ct->status` access in READ_ONCE(). This avoids possible KCSAN reports about concurrent updates to `ct->status`. Fixes: 0b41713b6066 ("icmp: introduce helper for nat'd source address in network device context") Signed-off-by: Fabian Bläse Cc: Jason A. 
Donenfeld Reviewed-by: Florian Westphal Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 6 ++++-- net/ipv6/ip6_icmp.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2ffe73ea644f..c48c572f024d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -799,11 +799,12 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) struct sk_buff *cloned_skb = NULL; struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; struct nf_conn *ct; __be32 orig_ip; ct = nf_ct_get(skb_in, &ctinfo); - if (!ct || !(ct->status & IPS_SRC_NAT)) { + if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmp_send(skb_in, type, code, info, &opts); return; } @@ -818,7 +819,8 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) goto out; orig_ip = ip_hdr(skb_in)->saddr; - ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; + dir = CTINFO2DIR(ctinfo); + ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; __icmp_send(skb_in, type, code, info, &opts); ip_hdr(skb_in)->saddr = orig_ip; out: diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 9e3574880cb0..233914b63bdb 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -54,11 +54,12 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); - if (!ct || !(ct->status & IPS_SRC_NAT)) { + if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } @@ -73,7 +74,8 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; - ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; + dir = CTINFO2DIR(ctinfo); + ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: -- cgit v1.2.3 From 59f26d86b2a16f1406f3b42025062b6d1fba5dd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:51 +0000 Subject: inet: ping: check sock_net() in ping_get_port() and ping_lookup() We need to check socket netns before considering them in ping_get_port(). Otherwise, one malicious netns could 'consume' all ports. Add corresponding check in ping_lookup(). 
Fixes: c319b4d76b9e ("net: ipv4: add IPPROTO_ICMP socket kind") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Yue Haibing Link: https://patch.msgid.link/20250829153054.474201-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index f119da68fc30..74a0beddfcc4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -77,6 +77,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table, int ping_get_port(struct sock *sk, unsigned short ident) { + struct net *net = sock_net(sk); struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; @@ -90,9 +91,10 @@ int ping_get_port(struct sock *sk, unsigned short ident) for (i = 0; i < (1L << 16); i++, result++) { if (!result) result++; /* avoid zero */ - hlist = ping_hashslot(&ping_table, sock_net(sk), - result); + hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { + if (!net_eq(sock_net(sk2), net)) + continue; isk2 = inet_sk(sk2); if (isk2->inet_num == result) @@ -108,8 +110,10 @@ next_port: if (i >= (1L << 16)) goto fail; } else { - hlist = ping_hashslot(&ping_table, sock_net(sk), ident); + hlist = ping_hashslot(&ping_table, net, ident); sk_for_each(sk2, hlist) { + if (!net_eq(sock_net(sk2), net)) + continue; isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c @@ -129,7 +133,7 @@ next_port: pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; @@ -188,6 +192,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } sk_for_each_rcu(sk, hslot) { + if (!net_eq(sock_net(sk), net)) + continue; isk = inet_sk(sk); pr_debug("iterate\n"); -- cgit v1.2.3 From 10343e7e6c7c6558217b56fb44a538ad04752adb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:52 +0000 Subject: inet: ping: remove ping_hash() There is no point in keeping ping_hash(). Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Yue Haibing Link: https://patch.msgid.link/20250829153054.474201-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ping.h | 1 - net/ipv4/ping.c | 10 ---------- net/ipv6/ping.c | 1 - 3 files changed, 12 deletions(-) (limited to 'net') diff --git a/include/net/ping.h b/include/net/ping.h index bc7779262e60..9634b8800814 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -54,7 +54,6 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 74a0beddfcc4..75e1b0f5c697 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -67,7 +67,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) pr_debug("hash(%u) = %u\n", num, res); return res; } -EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -144,14 +143,6 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -int ping_hash(struct sock *sk) -{ - pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); - BUG(); /* "Please do not press this button again." 
*/ - - return 0; -} - void ping_unhash(struct sock *sk) { struct inet_sock *isk = inet_sk(sk); @@ -1008,7 +999,6 @@ struct proto ping_prot = { .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 82b0492923d4..d7a2cdaa2631 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -208,7 +208,6 @@ struct proto pingv6_prot = { .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, -- cgit v1.2.3 From 689adb36bd433b24390080606a07d664cca2982e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:53 +0000 Subject: inet: ping: make ping_port_rover per netns Provide isolation between netns for ping idents. Randomize initial ping_port_rover value at netns creation. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250829153054.474201-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/ping.c | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6373e3f17da8..54a7d187f62a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -251,6 +251,7 @@ struct netns_ipv4 { int sysctl_igmp_qrv; struct ping_group_range ping_group_range; + u16 ping_port_rover; atomic_t dev_addr_genid; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 75e1b0f5c697..98ccd4f9ed65 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -58,8 +58,6 @@ static struct ping_table ping_table; struct pingv6_ops pingv6_ops; EXPORT_SYMBOL_GPL(pingv6_ops); -static u16 ping_port_rover; - static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { u32 res = (num + net_hash_mix(net)) & mask; @@ -84,12 +82,12 @@ int ping_get_port(struct sock *sk, unsigned short ident) isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { + u16 result = net->ipv4.ping_port_rover + 1; u32 i; - u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) - result++; /* avoid zero */ + continue; /* avoid zero */ hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { if (!net_eq(sock_net(sk2), net)) @@ -101,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) } /* found */ - ping_port_rover = ident = result; + net->ipv4.ping_port_rover = ident = result; break; next_port: ; @@ -1146,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net) if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; + + net->ipv4.ping_port_rover = get_random_u16(); return 0; } -- cgit v1.2.3 From 51ba2d26bcc61b65b7bd26346580d016ce8f7fa0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Aug 2025 15:30:54 +0000 Subject: inet: ping: use EXPORT_IPV6_MOD[_GPL]() There is no need to export ping symbols when CONFIG_IPV6=y. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250829153054.474201-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 98ccd4f9ed65..5321c5801c64 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@
-56,7 +56,7 @@ struct ping_table { static struct ping_table ping_table; struct pingv6_ops pingv6_ops; -EXPORT_SYMBOL_GPL(pingv6_ops); +EXPORT_IPV6_MOD_GPL(pingv6_ops); static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { @@ -139,7 +139,7 @@ fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } -EXPORT_SYMBOL_GPL(ping_get_port); +EXPORT_IPV6_MOD_GPL(ping_get_port); void ping_unhash(struct sock *sk) { @@ -154,7 +154,7 @@ void ping_unhash(struct sock *sk) } spin_unlock(&ping_table.lock); } -EXPORT_SYMBOL_GPL(ping_unhash); +EXPORT_IPV6_MOD_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) @@ -274,7 +274,7 @@ out_release_group: put_group_info(group_info); return ret; } -EXPORT_SYMBOL_GPL(ping_init_sock); +EXPORT_IPV6_MOD_GPL(ping_init_sock); void ping_close(struct sock *sk, long timeout) { @@ -284,7 +284,7 @@ void ping_close(struct sock *sk, long timeout) sk_common_release(sk); } -EXPORT_SYMBOL_GPL(ping_close); +EXPORT_IPV6_MOD_GPL(ping_close); static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -462,7 +462,7 @@ out: pr_debug("ping_v4_bind -> %d\n", err); return err; } -EXPORT_SYMBOL_GPL(ping_bind); +EXPORT_IPV6_MOD_GPL(ping_bind); /* * Is this a supported type of ICMP message? @@ -595,7 +595,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) out: return; } -EXPORT_SYMBOL_GPL(ping_err); +EXPORT_IPV6_MOD_GPL(ping_err); /* * Copy and checksum an ICMP Echo packet from user space into a buffer @@ -625,7 +625,7 @@ int ping_getfrag(void *from, char *to, return 0; } -EXPORT_SYMBOL_GPL(ping_getfrag); +EXPORT_IPV6_MOD_GPL(ping_getfrag); static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) @@ -686,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, return 0; } -EXPORT_SYMBOL_GPL(ping_common_sendmsg); +EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { @@ -931,7 +931,7 @@ out: pr_debug("ping_recvmsg -> %d\n", err); return err; } -EXPORT_SYMBOL_GPL(ping_recvmsg); +EXPORT_IPV6_MOD_GPL(ping_recvmsg); static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -952,7 +952,7 @@ int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? -1 : 0; } -EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); +EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb); /* @@ -980,7 +980,7 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); return SKB_DROP_REASON_NO_SOCKET; } -EXPORT_SYMBOL_GPL(ping_rcv); +EXPORT_IPV6_MOD_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", @@ -1002,7 +1002,7 @@ struct proto ping_prot = { .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; -EXPORT_SYMBOL(ping_prot); +EXPORT_IPV6_MOD(ping_prot); #ifdef CONFIG_PROC_FS @@ -1067,7 +1067,7 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) return *pos ? 
ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } -EXPORT_SYMBOL_GPL(ping_seq_start); +EXPORT_IPV6_MOD_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { @@ -1086,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } -EXPORT_SYMBOL_GPL(ping_seq_next); +EXPORT_IPV6_MOD_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } -EXPORT_SYMBOL_GPL(ping_seq_stop); +EXPORT_IPV6_MOD_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) -- cgit v1.2.3 From ba1e9421cf1a8369d25c3832439702a015d6b5f9 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 28 Aug 2025 20:41:17 +0800 Subject: net/smc: fix one NULL pointer dereference in smc_ib_is_sg_need_sync() BUG: kernel NULL pointer dereference, address: 00000000000002ec PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 28 UID: 0 PID: 343 Comm: kworker/28:1 Kdump: loaded Tainted: G OE 6.17.0-rc2+ #9 NONE Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_ib_is_sg_need_sync+0x9e/0xd0 [smc] ... Call Trace: smcr_buf_map_link+0x211/0x2a0 [smc] __smc_buf_create+0x522/0x970 [smc] smc_buf_create+0x3a/0x110 [smc] smc_find_rdma_v2_device_serv+0x18f/0x240 [smc] ? smc_vlan_by_tcpsk+0x7e/0xe0 [smc] smc_listen_find_device+0x1dd/0x2b0 [smc] smc_listen_work+0x30f/0x580 [smc] process_one_work+0x18c/0x340 worker_thread+0x242/0x360 kthread+0xe7/0x220 ret_from_fork+0x13a/0x160 ret_from_fork_asm+0x1a/0x30 If a software RoCE device is used, ibdev->dma_device is a NULL pointer, which leads to the oops above. Add a NULL pointer check to prevent this. Fixes: 0ef69e788411c ("net/smc: optimize for smc_sndbuf_sync_sg_for_device and smc_rmb_sync_sg_for_cpu") Signed-off-by: Liu Jian Reviewed-by: Guangguan Wang Reviewed-by: Zhu Yanjun Reviewed-by: D. Wythe Link: https://patch.msgid.link/20250828124117.2622624-1-liujian56@huawei.com Signed-off-by: Paolo Abeni --- net/smc/smc_ib.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 53828833a3f7..a42ef3f77b96 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -742,6 +742,9 @@ bool smc_ib_is_sg_need_sync(struct smc_link *lnk, unsigned int i; bool ret = false; + if (!lnk->smcibdev->ibdev->dma_device) + return ret; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { -- cgit v1.2.3 From 773b27a8a2f00ce3134e92e50ea4794a98ba2b76 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 29 Aug 2025 15:28:26 +0800 Subject: net: mctp: mctp_frag_queue should take ownership of passed skb As of commit f5d83cf0eeb9 ("net: mctp: unshare packets when reassembling"), we skb_unshare() in mctp_frag_queue(). The unshare may invalidate the original skb pointer, so we need to treat the skb as entirely owned by the frag queue, even on failure.
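The contract this establishes is worth spelling out: a function that may consume its argument must own it on every return path, or callers cannot know when to free. A small userspace sketch of the same rule, with a hypothetical frag_queue type and malloc()/free() standing in for skb allocation and kfree_skb():

#include <stdlib.h>
#include <string.h>

struct frag_queue {
	char *reasm;		/* reassembly buffer, like key->reasm_head */
	size_t len, cap;
};

/* Takes ownership of buf in all cases; -1 stands in for -EINVAL. */
static int frag_queue_append(struct frag_queue *q, char *buf, size_t len)
{
	if (q->len + len > q->cap)
		goto err_free;	/* too long: error path still owns buf */

	memcpy(q->reasm + q->len, buf, len);
	q->len += len;
	free(buf);		/* consumed on success */
	return 0;

err_free:
	free(buf);		/* consumed on failure too */
	return -1;
}

int main(void)
{
	struct frag_queue q = { .reasm = malloc(16), .len = 0, .cap = 16 };
	char *frag = malloc(8);

	memcpy(frag, "12345678", 8);
	(void)frag_queue_append(&q, frag, 8);
	frag = NULL;	/* caller forgets the pointer, success or not */

	free(q.reasm);
	return 0;
}

With a uniform consume-always rule, the caller simply sets its pointer to NULL after the call, which is exactly the skb = NULL pattern the patch applies below.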
Fixes: f5d83cf0eeb9 ("net: mctp: unshare packets when reassembling") Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250829-mctp-skb-unshare-v1-1-1c28fe10235a@codeconstruct.com.au Signed-off-by: Paolo Abeni --- net/mctp/route.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mctp/route.c b/net/mctp/route.c index 2b2b958ef6a3..4d314e062ba9 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -378,6 +378,7 @@ static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {} #endif +/* takes ownership of skb, both in success and failure cases */ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) { struct mctp_hdr *hdr = mctp_hdr(skb); @@ -387,8 +388,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) & MCTP_HDR_SEQ_MASK; if (!key->reasm_head) { - /* Since we're manipulating the shared frag_list, ensure it isn't - * shared with any other SKBs. + /* Since we're manipulating the shared frag_list, ensure it + * isn't shared with any other SKBs. In the cloned case, + * this will free the skb; callers can no longer access it + * safely. */ key->reasm_head = skb_unshare(skb, GFP_ATOMIC); if (!key->reasm_head) @@ -402,10 +405,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK; if (this_seq != exp_seq) - return -EINVAL; + goto err_free; if (key->reasm_head->len + skb->len > mctp_message_maxlen) - return -EINVAL; + goto err_free; skb->next = NULL; skb->sk = NULL; @@ -419,6 +422,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) key->reasm_head->truesize += skb->truesize; return 0; + +err_free: + kfree_skb(skb); + return -EINVAL; } static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) @@ -532,18 +539,16 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) * key isn't observable yet */ mctp_frag_queue(key, skb); + skb = NULL; /* if the key_add fails, we've raced with another * SOM packet with the same src, dest and tag. There's * no way to distinguish future packets, so all we - * can do is drop; we'll free the skb on exit from - * this function. + * can do is drop. */ rc = mctp_key_add(key, msk); - if (!rc) { + if (!rc) trace_mctp_key_acquire(key); - skb = NULL; - } /* we don't need to release key->lock on exit, so * clean up here and suppress the unlock via @@ -561,8 +566,7 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) key = NULL; } else { rc = mctp_frag_queue(key, skb); - if (!rc) - skb = NULL; + skb = NULL; } } @@ -572,17 +576,16 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) */ /* we need to be continuing an existing reassembly... */ - if (!key->reasm_head) + if (!key->reasm_head) { rc = -EINVAL; - else + } else { rc = mctp_frag_queue(key, skb); + skb = NULL; + } if (rc) goto out_unlock; - /* we've queued; the queue owns the skb now */ - skb = NULL; - /* end of message? deliver to socket, and we're done with * the reassembly/response key */ -- cgit v1.2.3 From 46015e6b3ea75297b28d4806564f3f692cf11861 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 17 Aug 2025 17:15:56 +0800 Subject: netfilter: ebtables: Use vmalloc_array() to improve code Remove array_size() calls and replace vmalloc() with vmalloc_array() to simplify the code. 
vmalloc_array() is also optimized better, uses fewer instructions, and handles overflow more concisely[1]. [1]: https://lore.kernel.org/lkml/abc66ec5-85a4-47e1-9759-2f60ab111971@vivo.com/ Signed-off-by: Qianfeng Rong Signed-off-by: Florian Westphal --- net/bridge/netfilter/ebtables.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3e67d4aff419..5697e3949a36 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -920,8 +920,8 @@ static int translate_table(struct net *net, const char *name, * if an error occurs */ newinfo->chainstack = - vmalloc(array_size(nr_cpu_ids, - sizeof(*(newinfo->chainstack)))); + vmalloc_array(nr_cpu_ids, + sizeof(*(newinfo->chainstack))); if (!newinfo->chainstack) return -ENOMEM; for_each_possible_cpu(i) { @@ -938,7 +938,7 @@ static int translate_table(struct net *net, const char *name, } } - cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s))); + cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s)); if (!cl_s) return -ENOMEM; i = 0; /* the i'th udc */ @@ -1018,8 +1018,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, * the check on the size is done later, when we have the lock */ if (repl->num_counters) { - unsigned long size = repl->num_counters * sizeof(*counterstmp); - counterstmp = vmalloc(size); + counterstmp = vmalloc_array(repl->num_counters, + sizeof(*counterstmp)); if (!counterstmp) return -ENOMEM; } @@ -1386,7 +1386,7 @@ static int do_update_counters(struct net *net, const char *name, if (num_counters == 0) return -EINVAL; - tmp = vmalloc(array_size(num_counters, sizeof(*tmp))); + tmp = vmalloc_array(num_counters, sizeof(*tmp)); if (!tmp) return -ENOMEM; @@ -1526,7 +1526,7 @@ static int copy_counters_to_user(struct ebt_table *t, if (num_counters != nentries) return -EINVAL; - counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp))); + counterstmp = vmalloc_array(nentries, sizeof(*counterstmp)); if (!counterstmp) return -ENOMEM; -- cgit v1.2.3 From c015e17ba111e1cd2c7180204b006266aa15c263 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 18 Aug 2025 11:48:30 +0200 Subject: netfilter: nft_payload: Use csum_replace4() instead of opencoding Open coded calculation can be avoided and replaced by the equivalent csum_replace4() in nft_csum_replace(). 
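The identity behind csum_replace4() is the RFC 1624 incremental update HC' = ~(~HC + ~m + m'): subtracting the old word and adding the new one in one's-complement arithmetic matches a full recomputation. A standalone C demonstration, with userspace stand-ins for csum_fold() and csum_replace4() and a checksum taken over raw 16-bit words for simplicity:

#include <stdio.h>
#include <stdint.h>

/* Fold a 32-bit accumulator into a 16-bit one's-complement sum. */
static uint16_t fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Full RFC 1071 checksum over n 16-bit words. */
static uint16_t cksum(const uint16_t *w, int n)
{
	uint32_t sum = 0;

	while (n--)
		sum += *w++;
	return ~fold(sum);
}

/* Incremental update for one changed 32-bit word (RFC 1624 eqn. 3),
 * applied 16 bits at a time: HC' = ~(~HC + ~m + m').
 */
static uint16_t cksum_replace4(uint16_t hc, uint32_t from, uint32_t to)
{
	uint32_t sum = (uint16_t)~hc;

	sum += (uint16_t)~(from >> 16);
	sum += (uint16_t)~(from & 0xffff);
	sum += to >> 16;
	sum += to & 0xffff;
	return ~fold(sum);
}

int main(void)
{
	uint16_t pkt[4] = { 0x1234, 0x5678, 0x9abc, 0xdef0 };
	uint16_t hc = cksum(pkt, 4);
	uint32_t from = ((uint32_t)pkt[2] << 16) | pkt[3];
	uint32_t to = 0x0a000001;	/* rewrite the word in pkt[2..3] */

	pkt[2] = to >> 16;
	pkt[3] = to & 0xffff;

	/* both print 8d52 for these inputs */
	printf("recomputed=%04x patched=%04x\n",
	       (unsigned)cksum(pkt, 4), (unsigned)cksum_replace4(hc, from, to));
	return 0;
}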
Signed-off-by: Christophe Leroy Reviewed-by: Simon Horman Signed-off-by: Florian Westphal --- net/netfilter/nft_payload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7dfc5343dae4..059b28ffad0e 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -684,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = { static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum); if (*sum == 0) *sum = CSUM_MANGLED_0; } -- cgit v1.2.3 From a60a5abe19d6acd9d9ea4c1883745399fb5dc023 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Aug 2025 10:15:37 +0200 Subject: netfilter: nf_tables: allow iter callbacks to sleep Quoting Sven Auhagen: we do see on occasions that we get the following error message, more so on x86 systems than on arm64: Error: Could not process rule: Cannot allocate memory delete table inet filter It is not a consistent error and does not happen all the time. We are on Kernel 6.6.80, seems to me like we have something along the lines of the nf_tables: allow clone callbacks to sleep problem using GFP_ATOMIC. As hinted at by Sven, this is because of GFP_ATOMIC allocations during set flush. When set is flushed, all elements are deactivated. This triggers a set walk and each element gets added to the transaction list. The rbtree and rhashtable sets don't allow the iter callback to sleep: rbtree walk acquires read side of an rwlock with bh disabled, rhashtable walk happens with rcu read lock held. Rbtree is simple enough to resolve: When the walk context is ITER_READ, no change is needed (the iter callback must not deactivate elements; we're not in a transaction). When the iter type is ITER_UPDATE, the rwlock isn't needed because the caller holds the transaction mutex, this prevents any and all changes to the ruleset, including add/remove of set elements. Rhashtable is slightly more complex. When the iter type is ITER_READ, no change is needed, like rbtree. For ITER_UPDATE, we hold transaction mutex which prevents elements from getting free'd, even outside of rcu read lock section. So build a temporary list of all elements while doing the rcu iteration and then call the iterator in a second pass. The disadvantage is the need to iterate twice, but this cost comes with the benefit to allow the iter callback to use GFP_KERNEL allocations in a followup patch. The new list based logic makes it necessary to catch recursive calls to the same set earlier. Such walk -> iter -> walk recursion for the same set can happen during ruleset validation in case userspace gave us a bogus (cyclic) ruleset where verdict map m jumps to chain that sooner or later also calls "vmap @m". Before the new ->in_update_walk test, the ruleset is rejected because the infinite recursion causes ctx->level to exceed the allowed maximum. But with the new logic added here, elements would get skipped: nft_rhash_walk_update would see elements that are on the walk_list of an older stack frame. As all recursive calls into same map results in -EMLINK, we can avoid this problem by using the new in_update_walk flag and reject immediately. Next patch converts the problematic GFP_ATOMIC allocations. 
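A rough userspace model of the two-pass walk described above: pass one snapshots element pointers onto a temporary list inside the section where sleeping is forbidden, pass two runs the iterator callback outside it, where blocking allocations are allowed. The types, list handling and callback are simplified stand-ins for the rhashtable/llist machinery, not the kernel API:

#include <stdio.h>

struct elem {
	int key;
	struct elem *walk_next;	/* like the new walk_node llist */
};

typedef int (*iter_fn)(struct elem *e);

static int walk_update(struct elem **table, int n, iter_fn fn)
{
	struct elem *snapshot = NULL;
	int err = 0;

	/* Pass 1: must not sleep (models the RCU-protected walk);
	 * only pointer pushes onto the snapshot list happen here.
	 */
	for (int i = 0; i < n; i++) {
		table[i]->walk_next = snapshot;
		snapshot = table[i];
	}

	/* Pass 2: plain loop outside the no-sleep section; the
	 * callback may block or allocate.
	 */
	for (struct elem *e = snapshot; e && !err; e = e->walk_next)
		err = fn(e);

	return err;
}

static int deactivate(struct elem *e)
{
	/* in the kernel this allocates a transaction with GFP_KERNEL */
	printf("deactivating element %d\n", e->key);
	return 0;
}

int main(void)
{
	struct elem a = { .key = 1 }, b = { .key = 2 };
	struct elem *table[] = { &a, &b };

	return walk_update(table, 2, deactivate);
}

The cost is walking the structure twice; the payoff, as the patch notes, is that the second pass runs under the transaction mutex with no atomicity constraint, so the allocations in the follow-up patch can use GFP_KERNEL.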
Reported-by: Sven Auhagen Closes: https://lore.kernel.org/netfilter-devel/BY1PR18MB5874110CAFF1ED098D0BC4E7E07BA@BY1PR18MB5874.namprd18.prod.outlook.com/ Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 2 + net/netfilter/nft_set_hash.c | 100 ++++++++++++++++++++++++++++++++++++-- net/netfilter/nft_set_rbtree.c | 35 ++++++++++--- 3 files changed, 126 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc..e2128663b160 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -556,6 +556,7 @@ struct nft_set_elem_expr { * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element + * @in_update_walk: true during ->walk() in transaction phase * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -590,6 +591,7 @@ struct nft_set { u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; + bool in_update_walk; u32 use; atomic_t nelems; u32 ndeact; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 266d0c637225..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -30,6 +30,7 @@ struct nft_rhash { struct nft_rhash_elem { struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key, goto err1; he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set, return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -295,6 +298,97 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. 
be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. + */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, struct nft_set_ext *ext) { diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..b311b66df3e9 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + switch (iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, -- cgit v1.2.3 From 3d95a2e016abab29ccb6f384576b2038e544a5a8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 22 Aug 2025 10:15:38 +0200 Subject: netfilter: nf_tables: all transaction allocations can now sleep Now that nft_setelem_flush is not called with rcu read lock held or disabled softinterrupts anymore this can now use GFP_KERNEL too. 
This is the last atomic allocation of transaction elements, so remove all gfp_t arguments and the wrapper function. This makes attempts to delete large sets much more reliable, before this was prone to transient memory allocation failures. Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 47 +++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58c5425d61c2..54519b3d2868 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,12 +151,12 @@ static void nft_ctx_init(struct nft_ctx *ctx, bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } -static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, - int msg_type, u32 size, gfp_t gfp) +static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, + int msg_type, u32 size) { struct nft_trans *trans; - trans = kzalloc(size, gfp); + trans = kzalloc(size, GFP_KERNEL); if (trans == NULL) return NULL; @@ -172,12 +172,6 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return trans; } -static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, - int msg_type, u32 size) -{ - return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); -} - static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) { switch (trans->msg_type) { @@ -442,8 +436,7 @@ static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, struct nft_trans_elem *tail, - struct nft_trans_elem *trans, - gfp_t gfp) + struct nft_trans_elem *trans) { unsigned int nelems, old_nelems = tail->nelems; struct nft_trans_elem *new_trans; @@ -466,9 +459,11 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, /* krealloc might free tail which invalidates list pointers */ list_del_init(&tail->nft_trans.list); - new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); + new_trans = krealloc(tail, struct_size(tail, elems, nelems), + GFP_KERNEL); if (!new_trans) { - list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); + list_add_tail(&tail->nft_trans.list, + &nft_net->commit_list); return false; } @@ -484,7 +479,7 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, } static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, - struct nft_trans *trans, gfp_t gfp) + struct nft_trans *trans) { struct nft_trans *tail; @@ -501,7 +496,7 @@ static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, case NFT_MSG_DELSETELEM: return nft_trans_collapse_set_elem(nft_net, nft_trans_container_elem(tail), - nft_trans_container_elem(trans), gfp); + nft_trans_container_elem(trans)); } return false; @@ -537,17 +532,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr } } -static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, - gfp_t gfp) +static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && trans->msg_type != NFT_MSG_DELSETELEM); - might_alloc(gfp); - - if (nft_trans_try_collapse(nft_net, trans, gfp)) { + if (nft_trans_try_collapse(nft_net, trans)) { kfree(trans); return; } @@ -7549,7 +7541,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } ue->priv = elem_priv; - 
nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); goto err_elem_free; } } @@ -7573,7 +7565,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_container_elem(trans)->elems[0].priv = elem.priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; err_set_full: @@ -7839,7 +7831,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); nft_trans_container_elem(trans)->elems[0].priv = elem.priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; fail_ops: @@ -7864,9 +7856,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, iter->genmask)) return 0; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - struct_size_t(struct nft_trans_elem, elems, 1), - GFP_ATOMIC); + trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM, + struct_size_t(struct nft_trans_elem, elems, 1)); if (!trans) return -ENOMEM; @@ -7877,7 +7868,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_trans_elem_set(trans) = set; nft_trans_container_elem(trans)->nelems = 1; nft_trans_container_elem(trans)->elems[0].priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; } @@ -7894,7 +7885,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_container_elem(trans)->elems[0].priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; } -- cgit v1.2.3 From 8959f27d39d65d759b0eb3aab248ffac27d531d7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 24 Aug 2025 12:44:39 +0200 Subject: netfilter: nft_set_pipapo: remove redundant test for avx feature bit Sebastian points out that avx2 depends on avx, see check_cpufeature_deps() in arch/x86/kernel/cpu/cpuid-deps.c: avx2 feature bit will be cleared when avx isn't available. No functional change intended. 
Reported-by: Sebastian Andrzej Siewior Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 3 +-- net/netfilter/nft_set_pipapo_avx2.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index b385cfcf886f..4b64c3bd8e70 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -530,8 +530,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, local_bh_disable(); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) - if (boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX) && - irq_fpu_usable()) { + if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) { e = pipapo_get_avx2(m, data, genmask, tstamp); local_bh_enable(); return e; diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 29326f3fcaf3..7559306d0aed 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1099,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + if (!boot_cpu_has(X86_FEATURE_AVX2)) return false; est->size = pipapo_estimate_size(desc); -- cgit v1.2.3 From f4f9e05904e11bbc772c031b35d0d25caa21d5e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Aug 2025 20:43:47 +0200 Subject: netfilter: nf_reject: remove unneeded exports These functions have no external callers and can be static. Signed-off-by: Florian Westphal --- include/net/netfilter/ipv4/nf_reject.h | 8 -------- include/net/netfilter/ipv6/nf_reject.h | 10 --------- net/ipv4/netfilter/nf_reject_ipv4.c | 27 ++++++++++++++++--------- net/ipv6/netfilter/nf_reject_ipv6.c | 37 +++++++++++++++++++++++----------- 4 files changed, 42 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index c653fcb88354..09de2f2686b5 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -10,14 +10,6 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth); - struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index d729344ba644..94ec0b9f2838 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -9,16 +9,6 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char cod unsigned int hooknum); void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit); -void 
nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen); - struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 0d3cb2ba6fc8..05631abe3f0d 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,15 @@ #include #include +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl); +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); + static int nf_reject_iphdr_validate(struct sk_buff *skb) { struct iphdr *iph; @@ -136,8 +145,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook) +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { const struct tcphdr *oth; @@ -163,11 +173,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, return oth; } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl) +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -188,10 +197,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, return niph; } -EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth) +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) { struct iphdr *niph = ip_hdr(nskb); struct tcphdr *tcph; @@ -218,7 +226,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) { diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index cb2d38e80de9..6b022449f867 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,19 @@ #include #include +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit); +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); + static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -146,9 +159,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook) +static const struct tcphdr * 
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; @@ -192,11 +206,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, return otcph; } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit) +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -216,11 +230,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, return ip6h; } -EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen) +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; @@ -248,7 +262,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, csum_partial(tcph, sizeof(struct tcphdr), 0)); } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { -- cgit v1.2.3 From 077dc4a275790b09e8a2ce80822ba8970e9dfb99 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Thu, 28 Aug 2025 14:48:31 +0200 Subject: netfilter: nft_payload: extend offset to 65535 bytes In some situations 255 bytes offset is not enough to match or manipulate the desired packet field. Increase the offset limit to 65535 or U16_MAX. In addition, the nla policy maximum value is not set anymore as it is limited to s16. Instead, the maximum value is checked during the payload expression initialization function. Tested with the nft command line tool. 
table ip filter { chain output { @nh,2040,8 set 0xff @nh,524280,8 set 0xff @nh,524280,8 0xff @nh,2040,8 0xff } } Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables_core.h | 2 +- net/netfilter/nft_payload.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828..7644cfe9267d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -73,7 +73,7 @@ struct nft_ct { struct nft_payload { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 dreg; }; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 059b28ffad0e..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; @@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), @@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, struct nft_payload_set { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 sreg; u8 csum_type; @@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr { }; static bool -nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, int *vlan_hlen) { struct nft_payload_vlan_hdr *vlanh; @@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); - u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { @@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); -- cgit v1.2.3 From d250f14f5f0754ce2d05d9c0ce778e4a51f488b0 Mon Sep 17 00:00:00 2001 From: James Flowers Date: Sun, 31 Aug 2025 20:04:59 -0700 Subject: net/smc: Replace use of strncpy on NUL-terminated 
string with strscpy strncpy is deprecated for use on NUL-terminated strings, as indicated in Documentation/process/deprecated.rst. strncpy NUL-pads the destination buffer and doesn't guarantee the destination buffer will be NUL terminated. Signed-off-by: James Flowers Reviewed-by: Simon Horman Reviewed-by: Dust Li Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250901030512.80099-1-bold.zone2373@fastmail.com Signed-off-by: Jakub Kicinski --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 76ad29e31d60..b90337f86e83 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -450,7 +450,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); - strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + strscpy(new_pe->ib_name, ib_name); new_pe->ib_port = ib_port; new_ibdev = true; -- cgit v1.2.3 From 3016024d7514e953cb3a6715ce29799373512eb4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Sep 2025 09:26:07 +0000 Subject: net_sched: add back BH safety to tcf_lock Jamal reported that we had to use BH safety after all, because stats can be updated from BH handler. Fixes: 3133d5c15cb5 ("net_sched: remove BH blocking in eight actions") Fixes: 53df77e78590 ("net_sched: act_skbmod: use RCU in tcf_skbmod_dump()") Fixes: e97ae742972f ("net_sched: act_tunnel_key: use RCU in tunnel_key_dump()") Fixes: 48b5e5dbdb23 ("net_sched: act_vlan: use RCU in tcf_vlan_dump()") Reported-by: Jamal Hadi Salim Closes: https://lore.kernel.org/netdev/CAM0EoMmhq66EtVqDEuNik8MVFZqkgxFbMu=fJtbNoYD7YXg4bA@mail.gmail.com/ Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250901092608.2032473-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/act_connmark.c | 4 ++-- net/sched/act_csum.c | 4 ++-- net/sched/act_ct.c | 4 ++-- net/sched/act_ctinfo.c | 4 ++-- net/sched/act_mpls.c | 4 ++-- net/sched/act_nat.c | 4 ++-- net/sched/act_pedit.c | 4 ++-- net/sched/act_skbedit.c | 4 ++-- net/sched/act_skbmod.c | 4 ++-- net/sched/act_tunnel_key.c | 4 ++-- net/sched/act_vlan.c | 4 ++-- 11 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index bf2d6b6da042..3e89927d7116 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -169,10 +169,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, nparms->action = parm->action; - spin_lock(&ci->tcf_lock); + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); - spin_unlock(&ci->tcf_lock); + spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 8bad91753615..0939e6b2ba4d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -101,11 +101,11 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, params_new->update_flags = parm->update_flags; params_new->action = parm->action; - spin_lock(&p->tcf_lock); + spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(p->params, params_new, lockdep_is_held(&p->tcf_lock)); - spin_unlock(&p->tcf_lock); + spin_unlock_bh(&p->tcf_lock); if (goto_ch) 
tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6d2355e73b0f..6749a4a9a9cd 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1410,11 +1410,11 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, goto cleanup; params->action = parm->action; - spin_lock(&c->tcf_lock); + spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, lockdep_is_held(&c->tcf_lock)); - spin_unlock(&c->tcf_lock); + spin_unlock_bh(&c->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 6f79eed9a544..71efe04d00b5 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -258,11 +258,11 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, cp_new->action = actparm->action; - spin_lock(&ci->tcf_lock); + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, lockdep_is_held(&ci->tcf_lock)); - spin_unlock(&ci->tcf_lock); + spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index ed7bdaa23f0d..6654011dcd2b 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -296,10 +296,10 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, htons(ETH_P_MPLS_UC)); p->action = parm->action; - spin_lock(&m->tcf_lock); + spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); - spin_unlock(&m->tcf_lock); + spin_unlock_bh(&m->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9cc2a1772cf8..26241d80ebe0 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -95,10 +95,10 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, p = to_tcf_nat(*a); - spin_lock(&p->tcf_lock); + spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparm = rcu_replace_pointer(p->parms, nparm, lockdep_is_held(&p->tcf_lock)); - spin_unlock(&p->tcf_lock); + spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 8fc8f577cb7a..4b65901397a8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -280,10 +280,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p = to_pedit(*a); nparms->action = parm->action; - spin_lock(&p->tcf_lock); + spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); - spin_unlock(&p->tcf_lock); + spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index aa6b1744de21..8c1d1554f657 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -261,11 +261,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, params_new->mask = *mask; params_new->action = parm->action; - spin_lock(&d->tcf_lock); + spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, lockdep_is_held(&d->tcf_lock)); - spin_unlock(&d->tcf_lock); + spin_unlock_bh(&d->tcf_lock); if (params_new) kfree_rcu(params_new, rcu); if (goto_ch) 
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index fce625eafcb2..a9e0c1326e2a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -194,7 +194,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, p->flags = lflags; p->action = parm->action; if (ovr) - spin_lock(&d->tcf_lock); + spin_lock_bh(&d->tcf_lock); /* Protected by tcf_lock if overwriting existing action. */ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p_old = rcu_dereference_protected(d->skbmod_p, 1); @@ -208,7 +208,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, rcu_assign_pointer(d->skbmod_p, p); if (ovr) - spin_unlock(&d->tcf_lock); + spin_unlock_bh(&d->tcf_lock); if (p_old) kfree_rcu(p_old, rcu); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index e1c8b48c217c..876b30c5709e 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -531,11 +531,11 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new->tcft_enc_metadata = metadata; params_new->action = parm->action; - spin_lock(&t->tcf_lock); + spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, lockdep_is_held(&t->tcf_lock)); - spin_unlock(&t->tcf_lock); + spin_unlock_bh(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index b46f980f3b2a..a74621797d69 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -253,10 +253,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } p->action = parm->action; - spin_lock(&v->tcf_lock); + spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); - spin_unlock(&v->tcf_lock); + spin_unlock_bh(&v->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); -- cgit v1.2.3 From 5d14bbf9d1d90cb7ca3e46fe2c8a4277572eab94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Sep 2025 09:31:41 +0000 Subject: net_sched: act: remove tcfa_qstats tcfa_qstats is currently only used to hold drops and overlimits counters. tcf_action_inc_drop_qstats() and tcf_action_inc_overlimit_qstats() currently acquire a->tcfa_lock to increment these counters. Switch to two atomic_t to get lock-free accounting. 
Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250901093141.2093176-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 14 ++++++-------- net/sched/act_api.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 2894cfff2da3..91a24b5e0b93 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -33,7 +33,10 @@ struct tc_action { struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; - struct gnet_stats_queue tcfa_qstats; + + atomic_t tcfa_drops; + atomic_t tcfa_overlimits; + struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; @@ -53,7 +56,6 @@ struct tc_action { #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats -#define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock @@ -241,9 +243,7 @@ static inline void tcf_action_inc_drop_qstats(struct tc_action *a) qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_drop_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_drops); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) @@ -252,9 +252,7 @@ static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_overlimit_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_overlimits); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9e468e463467..ff6be5cfe2b0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - a->tcfa_qstats.drops += drops; + atomic_add(drops, &a->tcfa_drops); if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { - int err = 0; + struct gnet_stats_queue qstats = {0}; struct gnet_dump d; + int err = 0; if (p == NULL) goto errout; @@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; + qstats.drops = atomic_read(&p->tcfa_drops); + qstats.overlimits = atomic_read(&p->tcfa_overlimits); + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfa_qstats, - p->tcfa_qstats.qlen) < 0) + &qstats, + qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) -- cgit v1.2.3 From fa390321aba0a54d0f7ae95ee4ecde1358bb9234 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sat, 30 Aug 2025 15:55:38 -0700 Subject: net/tcp: Fix socket memory leak in TCP-AO failure handling for IPv6 When tcp_ao_copy_all_matching() fails in tcp_v6_syn_recv_sock() it just exits the function. 
This ends up causing a memory-leak: unreferenced object 0xffff0000281a8200 (size 2496): comm "softirq", pid 0, jiffies 4295174684 hex dump (first 32 bytes): 7f 00 00 06 7f 00 00 06 00 00 00 00 cb a8 88 13 ................ 0a 00 03 61 00 00 00 00 00 00 00 00 00 00 00 00 ...a............ backtrace (crc 5ebdbe15): kmemleak_alloc+0x44/0xe0 kmem_cache_alloc_noprof+0x248/0x470 sk_prot_alloc+0x48/0x120 sk_clone_lock+0x38/0x3b0 inet_csk_clone_lock+0x34/0x150 tcp_create_openreq_child+0x3c/0x4a8 tcp_v6_syn_recv_sock+0x1c0/0x620 tcp_check_req+0x588/0x790 tcp_v6_rcv+0x5d0/0xc18 ip6_protocol_deliver_rcu+0x2d8/0x4c0 ip6_input_finish+0x74/0x148 ip6_input+0x50/0x118 ip6_sublist_rcv+0x2fc/0x3b0 ipv6_list_rcv+0x114/0x170 __netif_receive_skb_list_core+0x16c/0x200 netif_receive_skb_list_internal+0x1f0/0x2d0 This is because in tcp_v6_syn_recv_sock (and the IPv4 counterpart), when exiting upon error, inet_csk_prepare_forced_close() and tcp_done() need to be called. They make sure the newsk will end up being correctly free'd. tcp_v4_syn_recv_sock() makes this very clear by having the put_and_exit label that takes care of things. So, this patch here makes sure tcp_v4_syn_recv_sock and tcp_v6_syn_recv_sock have similar error-handling and thus fixes the leak for TCP-AO. Fixes: 06b22ef29591 ("net/tcp: Wire TCP-AO to request sockets") Signed-off-by: Christoph Paasch Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20250830-tcpao_leak-v1-1-e5878c2c3173@openai.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7577e7eb2c97..e885629312a4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1431,17 +1431,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) - goto out_overflow; + goto exit_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) - goto out; + goto exit; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) - goto out_nonewsk; + goto exit_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1525,25 +1525,19 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; - if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { - inet_csk_prepare_forced_close(newsk); - tcp_done(newsk); - goto out; - } + if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) + goto put_and_exit; } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) - goto out; /* OOM */ + goto put_and_exit; /* OOM */ #endif - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - tcp_done(newsk); - goto out; - } + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { @@ -1570,13 +1564,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * return newsk; -out_overflow: +exit_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: +exit_nonewsk: dst_release(dst); -out: +exit: tcp_listendrop(sk); return NULL; +put_and_exit: + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto exit; } 
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, -- cgit v1.2.3 From 3a5f55500f3e93cf4d62351c753452279b088b4b Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 1 Sep 2025 20:37:25 +0800 Subject: ipv6: annotate data-races around devconf->rpl_seg_enabled devconf->rpl_seg_enabled can be changed concurrently from /proc/sys/net/ipv6/conf; annotate lockless reads on it. Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20250901123726.1972881-2-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index d1ef9644f826..a23eb8734e15 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -494,10 +494,8 @@ static int ipv6_rpl_srh_rcv(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); - accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; - if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) - accept_rpl_seg = idev->cnf.rpl_seg_enabled; - + accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled), + READ_ONCE(idev->cnf.rpl_seg_enabled)); if (!accept_rpl_seg) { kfree_skb(skb); return -1; -- cgit v1.2.3 From 3d95261eeb74958cd496e1875684827dc5d028cc Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 1 Sep 2025 20:37:26 +0800 Subject: ipv6: Add sanity checks on ipv6_devconf.rpl_seg_enabled In ipv6_rpl_srh_rcv(), the expression min(net->ipv6.devconf_all->rpl_seg_enabled, idev->cnf.rpl_seg_enabled) is intended to return 0 when either value is zero, but if one of the values is negative it will in fact return non-zero. Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20250901123726.1972881-3-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f17a5dd4789f..40e9c336f6c5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7238,7 +7238,9 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.rpl_seg_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ioam6_enabled", -- cgit v1.2.3 From 62b635dcd69c4fde7ce1de4992d71420a37e51e3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 29 Aug 2025 15:48:45 +0300 Subject: wifi: cfg80211: sme: cap SSID length in __cfg80211_connect_result() If ssid->datalen is more than IEEE80211_MAX_SSID_LEN (32), it would lead to memory corruption, so add some bounds checking.
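The bounds-capping pattern from the fix, as a self-contained userspace sketch; MAX_SSID_LEN mirrors IEEE80211_MAX_SSID_LEN, everything else is invented:

#include <stdio.h>
#include <string.h>

#define MAX_SSID_LEN 32

static size_t copy_ssid(char dst[MAX_SSID_LEN],
			const unsigned char *data, size_t datalen)
{
	/* cap the attacker-controlled length before copying */
	size_t n = datalen < MAX_SSID_LEN ? datalen : MAX_SSID_LEN;

	memcpy(dst, data, n);
	return n;
}

int main(void)
{
	unsigned char elem[40] = "way-too-long-ssid-from-a-hostile-frame";
	char ssid[MAX_SSID_LEN];

	printf("copied %zu bytes\n", copy_ssid(ssid, elem, sizeof(elem)));
	return 0;
}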
Fixes: c38c70185101 ("wifi: cfg80211: Set SSID if it is not already set") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/0aaaae4a3ed37c6252363c34ae4904b1604e8e32.1756456951.git.dan.carpenter@linaro.org Signed-off-by: Johannes Berg --- net/wireless/sme.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 826ec0a6355f..3a028ff287fb 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -900,13 +900,16 @@ void __cfg80211_connect_result(struct net_device *dev, if (!wdev->u.client.ssid_len) { rcu_read_lock(); for_each_valid_link(cr, link) { + u32 ssid_len; + ssid = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_SSID); if (!ssid || !ssid->datalen) continue; - memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); + ssid_len = min(ssid->datalen, IEEE80211_MAX_SSID_LEN); + memcpy(wdev->u.client.ssid, ssid->data, ssid_len); wdev->u.client.ssid_len = ssid->datalen; break; } -- cgit v1.2.3 From 58febb47b961a91d0d12ee0c1618a7843c0908ce Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 Aug 2025 22:17:59 -0400 Subject: wifi: cfg80211: Remove unused tracepoints Tracepoints that are defined take up around 5K each, even if they are not used. If they are defined and not used, then they waste memory for unused code. Soon unused tracepoints will cause warnings. Remove the unused tracepoints of the cfg80211 subsystem. They are: cfg80211_chandef_dfs_required cfg80211_return_u32 cfg80211_return_uint cfg80211_send_rx_auth Signed-off-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250828221759.131160ee@batman.local.home Signed-off-by: Johannes Berg --- net/wireless/trace.h | 56 ---------------------------------------------------- 1 file changed, 56 deletions(-) (limited to 'net') diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 34c584a215e5..9b6074155d59 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3137,23 +3137,6 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate, TP_ARGS(netdev, macaddr) ); -DECLARE_EVENT_CLASS(netdev_evt_only, - TP_PROTO(struct net_device *netdev), - TP_ARGS(netdev), - TP_STRUCT__entry( - NETDEV_ENTRY - ), - TP_fast_assign( - NETDEV_ASSIGN; - ), - TP_printk(NETDEV_PR_FMT , NETDEV_PR_ARG) -); - -DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth, - TP_PROTO(struct net_device *netdev), - TP_ARGS(netdev) -); - TRACE_EVENT(cfg80211_send_rx_assoc, TP_PROTO(struct net_device *netdev, const struct cfg80211_rx_assoc_resp_data *data), @@ -3480,21 +3463,6 @@ TRACE_EVENT(cfg80211_reg_can_beacon, __entry->prohibited_flags, __entry->permitting_flags) ); -TRACE_EVENT(cfg80211_chandef_dfs_required, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), - TP_STRUCT__entry( - WIPHY_ENTRY - CHAN_DEF_ENTRY - ), - TP_fast_assign( - WIPHY_ASSIGN; - CHAN_DEF_ASSIGN(chandef); - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) -); - TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, @@ -3862,30 +3830,6 @@ DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss, TP_ARGS(pub) ); -TRACE_EVENT(cfg80211_return_uint, - TP_PROTO(unsigned int ret), - TP_ARGS(ret), - TP_STRUCT__entry( - __field(unsigned int, ret) - ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret: %d", __entry->ret) -); - -TRACE_EVENT(cfg80211_return_u32, - TP_PROTO(u32 ret), - TP_ARGS(ret), - TP_STRUCT__entry( - __field(u32, ret) 
- ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret: %u", __entry->ret) -); - TRACE_EVENT(cfg80211_report_wowlan_wakeup, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_wowlan_wakeup *wakeup), -- cgit v1.2.3 From ac36daa83650c26fd55dee1a6ee5144769239911 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 26 Aug 2025 13:54:31 +0200 Subject: wifi: mac80211: Make CONNECTION_MONITOR optional for MLO sta Since commit '1bc892d76a6f ("wifi: mac80211: extend connection monitoring for MLO")' mac80211 supports connection monitor for MLO client interfaces. Remove the CONNECTION_MONITOR requirement in ieee80211_register_hw routine. Fixes: 1bc892d76a6f ("wifi: mac80211: extend connection monitoring for MLO") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250826-remove-conn-mon-check-ieee80211_register_hw-v2-1-5a1e2f038245@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/main.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 9c8f18b258a6..beee51354931 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1164,9 +1164,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (WARN_ON(!ieee80211_hw_check(hw, MFP_CAPABLE))) return -EINVAL; - if (WARN_ON(!ieee80211_hw_check(hw, CONNECTION_MONITOR))) - return -EINVAL; - if (WARN_ON(ieee80211_hw_check(hw, NEED_DTIM_BEFORE_ASSOC))) return -EINVAL; -- cgit v1.2.3 From 1373f94148a5adac2f42c8ba9771105624fe4af0 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Tue, 26 Aug 2025 20:25:24 +0300 Subject: wifi: mac80211: count reg connection element in the size We currently don't count the reg connection length in the per-link capability length. Fix it. Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250826202512.b14fc82f736b.I03442382e8a07f6f9836bcdac2e22ce8afbe6a21@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1008eb8e9b13..353e89973d1e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2104,8 +2104,11 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN; - if (sband->band == NL80211_BAND_6GHZ) + if (sband->band == NL80211_BAND_6GHZ) { size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); + /* reg connection */ + size += 4; + } size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + sizeof(struct ieee80211_eht_mcs_nss_supp) + -- cgit v1.2.3 From 2d5be5629ce73522d1c739579d6e8e450de8685e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 2 Sep 2025 23:11:33 +0200 Subject: mptcp: use HMAC-SHA256 library instead of open-coded HMAC Now that there are easy-to-use HMAC-SHA256 library functions, use these in net/mptcp/crypto.c instead of open-coding the HMAC algorithm. Remove the WARN_ON_ONCE() for messages longer than SHA256_DIGEST_SIZE. The new implementation handles all message lengths correctly. The mptcp-crypto KUnit test still passes after this change. 
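A userspace analogue of the resulting one-call style, using OpenSSL's HMAC() in place of the kernel's hmac_sha256_usingrawkey(); the two big-endian u64 keys mirror the layout above, while the key and message values are made up (build with cc demo.c -lcrypto):

#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t key1 = 0x0102030405060708ULL, key2 = 0x1112131415161718ULL;
	const unsigned char msg[] = "token material";
	unsigned char key[16], mac[32];
	unsigned int maclen = 0;

	/* put_unaligned_be64()-style packing of both keys */
	for (int i = 0; i < 8; i++) {
		key[i]     = key1 >> (56 - 8 * i);
		key[i + 8] = key2 >> (56 - 8 * i);
	}
	HMAC(EVP_sha256(), key, sizeof(key), msg, sizeof(msg) - 1,
	     mac, &maclen);
	for (unsigned int i = 0; i < maclen; i++)
		printf("%02x", mac[i]);
	putchar('\n');
	return 0;
}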
Signed-off-by: Eric Biggers Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250902-net-next-mptcp-misc-feat-6-18-v2-1-fa02bb3188b1@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/crypto.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index b08ba959ac4f..31948e18d97d 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -22,7 +22,6 @@ #include #include -#include #include "protocol.h" @@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { - u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; - u8 key1be[8]; - u8 key2be[8]; - int i; + __be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) }; - if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) - len = SHA256_DIGEST_SIZE; - - put_unaligned_be64(key1, key1be); - put_unaligned_be64(key2, key2be); - - /* Generate key xored with ipad */ - memset(input, 0x36, SHA256_BLOCK_SIZE); - for (i = 0; i < 8; i++) - input[i] ^= key1be[i]; - for (i = 0; i < 8; i++) - input[i + 8] ^= key2be[i]; - - memcpy(&input[SHA256_BLOCK_SIZE], msg, len); - - /* emit sha256(K1 || msg) on the second input block, so we can - * reuse 'input' for the last hashing - */ - sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]); - - /* Prepare second part of hmac */ - memset(input, 0x5C, SHA256_BLOCK_SIZE); - for (i = 0; i < 8; i++) - input[i] ^= key1be[i]; - for (i = 0; i < 8; i++) - input[i + 8] ^= key2be[i]; - - sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); + hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac); } #if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) -- cgit v1.2.3 From 3bd4f98a4e2c601895f0ca8844098caedf4717a1 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Tue, 2 Sep 2025 23:11:36 +0200 Subject: mptcp: record subflows in RPS table Accelerated Receive Flow Steering (aRFS) relies on sockets recording their RX flow hash into the rps_sock_flow_table so that incoming packets are steered to the CPU where the application runs. With MPTCP, the application interacts with the parent MPTCP socket while data is carried over per-subflow TCP sockets. Without recording these subflows, aRFS cannot steer interrupts and RX processing for the flows to the desired CPU. Record all subflows in the RPS table by calling sock_rps_record_flow() for each subflow at the start of mptcp_sendmsg(), mptcp_recvmsg() and mptcp_stream_accept(), by using the new helper mptcp_rps_record_subflows(). It does not by itself improve throughput, but ensures that IRQ and RX processing are directed to the right CPU, which is a prerequisite for effective aRFS. 
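A toy model of the steering idea; the table size, hash values and names are invented, and the real rps_sock_flow_table involves per-entry CPU state and more bookkeeping:

#include <stdio.h>

#define FLOW_TABLE_SIZE 16

static int cpu_for_flow[FLOW_TABLE_SIZE];

static void record_flow(unsigned int rxhash, int cpu)
{
	cpu_for_flow[rxhash % FLOW_TABLE_SIZE] = cpu;
}

int main(void)
{
	/* an MPTCP socket must record every subflow's hash, not its own */
	unsigned int subflow_hash[] = { 0xdeadbeefu, 0xcafef00du };
	int app_cpu = 3;	/* CPU calling recvmsg() on the MPTCP socket */

	for (int i = 0; i < 2; i++)
		record_flow(subflow_hash[i], app_cpu);
	for (int i = 0; i < 2; i++)
		printf("subflow %d steered to cpu %d\n", i,
		       cpu_for_flow[subflow_hash[i] % FLOW_TABLE_SIZE]);
	return 0;
}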
Signed-off-by: Christoph Paasch Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250902-net-next-mptcp-misc-feat-6-18-v2-4-fa02bb3188b1@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ad41c48126e4..a8d57b88578d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1740,6 +1741,20 @@ static u32 mptcp_send_limit(const struct sock *sk) return limit - not_sent; } +static void mptcp_rps_record_subflows(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!rfs_is_needed()) + return; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + sock_rps_record_flow(ssk); + } +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1753,6 +1768,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + mptcp_rps_record_subflows(msk); + if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; @@ -2131,6 +2148,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out_err; } + mptcp_rps_record_subflows(msk); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); @@ -3922,6 +3941,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, mptcp_sock_graft(ssk, newsock); } + mptcp_rps_record_subflows(msk); + /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ -- cgit v1.2.3 From 9f9581ba74a931843c6d807ecfeaff9fb8c1b731 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Tue, 2 Sep 2025 15:46:35 +0000 Subject: netlink: specs: fou: change local-v6/peer-v6 check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While updating the binary min-len implementation, I noticed that the only user, should AFAICT be using exact-len instead. In net/ipv4/fou_core.c FOU_ATTR_LOCAL_V6 and FOU_ATTR_PEER_V6 are only used for singular IPv6 addresses, and there are AFAICT no known implementations trying to send more, it therefore appears safe to change it to an exact-len policy. This patch therefore changes the local-v6/peer-v6 attributes to use an exact-len check, instead of a min-len check. 
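The difference between the two checks, reduced to plain C stand-ins for the netlink policy macros (a sketch, not the nla_policy machinery itself):

#include <stdbool.h>
#include <stdio.h>

static bool check_min_len(size_t len)   { return len >= 16; }	/* old */
static bool check_exact_len(size_t len) { return len == 16; }	/* new */

int main(void)
{
	size_t lens[] = { 15, 16, 20 };

	for (int i = 0; i < 3; i++)
		printf("len %2zu: min-len %s, exact-len %s\n", lens[i],
		       check_min_len(lens[i]) ? "ok" : "reject",
		       check_exact_len(lens[i]) ? "ok" : "reject");
	return 0;
}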
Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250902154640.759815-2-ast@fiberby.net Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/fou.yaml | 4 ++-- net/ipv4/fou_nl.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/fou.yaml b/Documentation/netlink/specs/fou.yaml index 57735726262e..8e7974ec453f 100644 --- a/Documentation/netlink/specs/fou.yaml +++ b/Documentation/netlink/specs/fou.yaml @@ -52,7 +52,7 @@ attribute-sets: name: local-v6 type: binary checks: - min-len: 16 + exact-len: 16 - name: peer-v4 type: u32 @@ -60,7 +60,7 @@ attribute-sets: name: peer-v6 type: binary checks: - min-len: 16 + exact-len: 16 - name: peer-port type: u16 diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 3d9614609b2d..506260b4a4dc 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -18,9 +18,9 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { [FOU_ATTR_TYPE] = { .type = NLA_U8, }, [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .len = 16, }, + [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16), [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_PEER_V6] = { .len = 16, }, + [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16), [FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; -- cgit v1.2.3 From 5d6b58c932ec451a5c41482790eb5b1ecf165a94 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Sep 2025 18:36:03 +0000 Subject: net: lockless sock_i_ino() Followup of commit c51da3f7a161 ("net: remove sock_i_uid()") A recent syzbot report was the trigger for this change. Over the years, we had many problems caused by the read_lock[_bh](&sk->sk_callback_lock) in sock_i_uid(). We could fix smc_diag_dump_proto() or make a more radical move: Instead of waiting for new syzbot reports, cache the socket inode number in sk->sk_ino, so that we no longer need to acquire sk->sk_callback_lock in sock_i_ino(). This makes socket dumps faster (one less cache line miss, and two atomic ops avoided). 
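A userspace sketch of the caching scheme, with illustrative types rather than the kernel structures; the kernel pairs READ_ONCE()/WRITE_ONCE() where this model does plain loads and stores:

#include <pthread.h>
#include <stdio.h>

struct inode  { unsigned long i_ino; };
struct socket { struct inode inode; };

struct sock {
	pthread_rwlock_t callback_lock;
	struct socket *socket;
	unsigned long sk_ino;	/* cached at graft/orphan time */
};

/* old scheme: take the lock and chase pointers on every dump */
static unsigned long sock_i_ino_old(struct sock *sk)
{
	unsigned long ino;

	pthread_rwlock_rdlock(&sk->callback_lock);
	ino = sk->socket ? sk->socket->inode.i_ino : 0;
	pthread_rwlock_unlock(&sk->callback_lock);
	return ino;
}

/* new scheme: a plain read of the cached copy */
static unsigned long sock_i_ino_new(const struct sock *sk)
{
	return sk->sk_ino;
}

int main(void)
{
	struct socket so = { .inode = { .i_ino = 42 } };
	struct sock sk = { .socket = &so, .sk_ino = 42 };

	pthread_rwlock_init(&sk.callback_lock, NULL);
	printf("old=%lu new=%lu\n", sock_i_ino_old(&sk), sock_i_ino_new(&sk));
	return 0;
}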
Prior art: commit 25a9c8a4431c ("netlink: Add __sock_i_ino() for __netlink_diag_dump().") commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH.") commit efc3dbc37412 ("rds: Make rds_sock_lock BH rather than IRQ safe.") Fixes: d2d6422f8bd1 ("x86: Allow to enable PREEMPT_RT.") Reported-by: syzbot+50603c05bbdf4dfdaffa@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68b73804.050a0220.3db4df.01d8.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250902183603.740428-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 17 +++++++++++++---- net/core/sock.c | 22 ---------------------- net/mptcp/protocol.c | 1 - net/netlink/diag.c | 2 +- 4 files changed, 14 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index c8a4b283df6f..fb13322a11fc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -285,6 +285,7 @@ struct sk_filter; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_ino: inode number (zero if orphaned) * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -518,6 +519,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; @@ -2056,6 +2058,10 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; + if (sock) { + WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); + WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } } static inline wait_queue_head_t *sk_sleep(struct sock *sk) @@ -2077,6 +2083,7 @@ static inline void sock_orphan(struct sock *sk) sk_set_socket(sk, NULL); sk->sk_wq = NULL; /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); write_unlock_bh(&sk->sk_callback_lock); } @@ -2087,20 +2094,22 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } +static inline unsigned long sock_i_ino(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ + return READ_ONCE(sk->sk_ino); +} + static inline kuid_t sk_uid(const struct sock *sk) { /* Paired with WRITE_ONCE() in sockfs_setattr() */ return READ_ONCE(sk->sk_uid); } -unsigned long __sock_i_ino(struct sock *sk); -unsigned long sock_i_ino(struct sock *sk); - static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); diff --git a/net/core/sock.c b/net/core/sock.c index 7c26ec8dce63..158bddd23134 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2780,28 +2780,6 @@ void sock_pfree(struct sk_buff *skb) EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -unsigned long __sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - read_lock(&sk->sk_callback_lock); - ino = sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); - return ino; -} -EXPORT_SYMBOL(__sock_i_ino); - -unsigned long sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - local_bh_disable(); - ino = __sock_i_ino(sk); - local_bh_enable(); - return ino; -} -EXPORT_SYMBOL(sock_i_ino); - /* * Allocate a skb from the socket's send buffer. */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9a287b75c1b3..e6fd97b21e9e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3554,7 +3554,6 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); - WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); write_unlock_bh(&sk->sk_callback_lock); } diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 61981e01fd6f..b8e58132e8af 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -168,7 +168,7 @@ mc_list: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - __sock_i_ino(sk)) < 0) { + sock_i_ino(sk)) < 0) { ret = 1; break; } -- cgit v1.2.3 From a51160f8da850a65afbf165f5bbac7ffb388bf74 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 2 Sep 2025 09:36:08 +0300 Subject: ipv4: Fix NULL vs error pointer check in inet_blackhole_dev_init() The inetdev_init() function never returns NULL. Check for error pointers instead. Fixes: 22600596b675 ("ipv4: give an IPv4 dev to blackhole_netdev") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/aLaQWL9NguWmeM1i@stanley.mountain Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c47d3828d4f6..942a887bf089 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -340,14 +340,13 @@ static void inetdev_destroy(struct in_device *in_dev) static int __init inet_blackhole_dev_init(void) { - int err = 0; + struct in_device *in_dev; rtnl_lock(); - if (!inetdev_init(blackhole_netdev)) - err = -ENOMEM; + in_dev = inetdev_init(blackhole_netdev); rtnl_unlock(); - return err; + return PTR_ERR_OR_ZERO(in_dev); } late_initcall(inet_blackhole_dev_init); -- cgit v1.2.3 From cc282f73bc0cbdf3ee7af2f2d3a2ef4e6b19242d Mon Sep 17 00:00:00 2001 From: Mahanta Jambigi Date: Tue, 2 Sep 2025 10:20:41 +0200 Subject: net/smc: Remove validation of reserved bits in CLC Decline message Currently SMC code is validating the reserved bits while parsing the incoming CLC decline message and, when this validation fails, it is treated as a protocol error. As a result, the SMC connection is terminated instead of falling back to TCP. As per the RFC 7609[1] specification, we shouldn't be validating the reserved bits that are part of the CLC message. This patch fixes this issue. CLC Decline message format can be viewed here[2].
[1] https://datatracker.ietf.org/doc/html/rfc7609#page-92 [2] https://datatracker.ietf.org/doc/html/rfc7609#page-105 Fixes: 8ade200c269f ("net/smc: add v2 format of CLC decline message") Signed-off-by: Mahanta Jambigi Reviewed-by: Sidraya Jayagond Reviewed-by: Alexandra Winter Reviewed-by: Dust Li Link: https://patch.msgid.link/20250902082041.98996-1-mjambigi@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/smc_clc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 5a4db151fe95..08be56dfb3f2 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -426,8 +426,6 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) { struct smc_clc_msg_hdr *hdr = &dclc->hdr; - if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) - return false; if (hdr->version == SMC_V1) { if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) return false; -- cgit v1.2.3 From a125c8fb9ddbcb0602103a50727a476fd30dec01 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Tue, 2 Sep 2025 03:20:55 -0700 Subject: mctp: return -ENOPROTOOPT for unknown getsockopt options In mctp_getsockopt(), unrecognized options currently return -EINVAL. In contrast, mctp_setsockopt() returns -ENOPROTOOPT for unknown options. Update mctp_getsockopt() to also return -ENOPROTOOPT for unknown options. This aligns the behavior of getsockopt() and setsockopt(), and matches the standard kernel socket API convention for handling unsupported options. Fixes: 99ce45d5e7db ("mctp: Implement extended addressing") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20250902102059.1370008-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index df4e8cf33899..685524800d70 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -425,7 +425,7 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname, return 0; } - return -EINVAL; + return -ENOPROTOOPT; } /* helpers for reading/writing the tag ioc, handling compatibility across the -- cgit v1.2.3 From 8156210d36a43e76372312c87eb5ea3dbb405a85 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Sep 2025 12:46:42 +0000 Subject: ax25: properly unshare skbs in ax25_kiss_rcv() Bernard Pidoux reported a regression apparently caused by commit c353e8983e0d ("net: introduce per netns packet chains"). skb->dev becomes NULL and we crash in __netif_receive_skb_core(). Before the above commit, different kinds of bugs or corruption could happen without a major crash. But the root cause is that ax25_kiss_rcv() can queue/mangle the input skb without checking whether it is shared. Many thanks to Bernard Pidoux for his help, diagnosis and tests. We had a similar issue years ago fixed with commit 7aaed57c5c28 ("phonet: properly unshare skbs in phonet_rcv()").
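A toy refcount model of what skb_share_check() provides in the fix above; the struct and names are invented for illustration:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf {
	int users;		/* models refcount_read(&skb->users) */
	unsigned char data[64];
};

static struct buf *buf_share_check(struct buf *b)
{
	struct buf *clone;

	if (b->users == 1)
		return b;		/* sole owner: safe to queue/mangle */
	clone = malloc(sizeof(*clone));
	if (!clone)
		return NULL;		/* caller returns NET_RX_DROP */
	memcpy(clone->data, b->data, sizeof(clone->data));
	clone->users = 1;
	b->users--;			/* drop our ref to the shared copy */
	return clone;
}

int main(void)
{
	struct buf shared = { .users = 2, .data = "hello" };
	struct buf *mine = buf_share_check(&shared);

	printf("%s private copy\n", mine == &shared ? "no" : "got");
	if (mine != &shared)
		free(mine);
	return 0;
}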
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Bernard Pidoux Closes: https://lore.kernel.org/netdev/1713f383-c538-4918-bc64-13b3288cd542@free.fr/ Tested-by: Bernard Pidoux Signed-off-by: Eric Dumazet Cc: Joerg Reuter Cc: David Ranch Cc: Folkert van Heusden Reviewed-by: Dan Cross Link: https://patch.msgid.link/20250902124642.212705-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ax25/ax25_in.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 1cac25aca637..f2d66af86359 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -433,6 +433,10 @@ free: int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + skb_orphan(skb); if (!net_eq(dev_net(dev), &init_net)) { -- cgit v1.2.3 From 4039ce7ef40474d5ba46f414c50cc7020b9cf8ae Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 7 Aug 2025 15:49:59 +0200 Subject: netfilter: nf_tables: Introduce NFTA_DEVICE_PREFIX This new attribute is supposed to be used instead of NFTA_DEVICE_NAME for simple wildcard interface specs. It holds a NUL-terminated string representing an interface name prefix to match on. While kernel code to distinguish full names from prefixes in NFTA_DEVICE_NAME is simpler than this solution, reusing the existing attribute with different semantics leads to confusion between different versions of kernel and user space though: * With old kernels, wildcards submitted by user space are accepted yet silently treated as regular names. * With old user space, wildcards submitted by kernel may cause crashes since libnftnl expects NUL-termination when there is none. Using a distinct attribute type sanitizes these situations as the receiving part detects and rejects the unexpected attribute nested in *_HOOK_DEVS attributes. Fixes: 6d07a289504a ("netfilter: nf_tables: Support wildcard netdev hook specs") Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 42 +++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2beb30be2c5f..8e0eb832bc01 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1784,10 +1784,12 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, + NFTA_DEVICE_PREFIX, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58c5425d61c2..c1082de09656 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1959,6 +1959,18 @@ nla_put_failure: return -ENOSPC; } +static bool hook_is_prefix(struct nft_hook *hook) +{ + return strlen(hook->ifname) >= hook->ifnamelen; +} + +static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) +{ + int attr = hook_is_prefix(hook) ? 
NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME; + + return nla_put_string(skb, attr, hook->ifname); +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, @@ -1990,16 +2002,15 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!first) first = hook; - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put(skb, NFTA_HOOK_DEV, - first->ifnamelen, first->ifname)) + !hook_is_prefix(first) && + nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -2310,7 +2321,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain) } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, - const struct nlattr *attr) + const struct nlattr *attr, + bool prefix) { struct nf_hook_ops *ops; struct net_device *dev; @@ -2327,7 +2339,8 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, if (err < 0) goto err_hook_free; - hook->ifnamelen = nla_len(attr); + /* include the terminating NUL-char when comparing non-prefixes */ + hook->ifnamelen = strlen(hook->ifname) + !prefix; /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with @@ -2374,14 +2387,22 @@ static int nf_tables_parse_netdev_hooks(struct net *net, struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; + bool prefix; nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { + switch (nla_type(tmp)) { + case NFTA_DEVICE_NAME: + prefix = false; + break; + case NFTA_DEVICE_PREFIX: + prefix = true; + break; + default: err = -EINVAL; goto err_hook; } - hook = nft_netdev_hook_alloc(net, tmp); + hook = nft_netdev_hook_alloc(net, tmp, prefix); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); @@ -2427,7 +2448,7 @@ static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], int err; if (tb[NFTA_HOOK_DEV]) { - hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); @@ -9458,8 +9479,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); -- cgit v1.2.3 From 0a228624bcc00af41f281a2a84c928595a74c17d Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Mon, 1 Sep 2025 14:35:37 +0800 Subject: net: atm: fix memory leak in atm_register_sysfs when device_register fail When device_register() returns an error in atm_register_sysfs(), which can be triggered by a kzalloc failure in device_private_init() or other reasons, kmemleak reports the following memory leaks: unreferenced object 0xffff88810182fb80 (size 8): comm "insmod", pid 504, jiffies 4294852464 hex dump (first 8 bytes): 61 64 75 6d 6d 79 30 00 adummy0. 
backtrace (crc 14dfadaf): __kmalloc_node_track_caller_noprof+0x335/0x450 kvasprintf+0xb3/0x130 kobject_set_name_vargs+0x45/0x120 dev_set_name+0xa9/0xe0 atm_register_sysfs+0xf3/0x220 atm_dev_register+0x40b/0x780 0xffffffffa000b089 do_one_initcall+0x89/0x300 do_init_module+0x27b/0x7d0 load_module+0x54cd/0x5ff0 init_module_from_file+0xe4/0x150 idempotent_init_module+0x32c/0x610 __x64_sys_finit_module+0xbd/0x120 do_syscall_64+0xa8/0x270 entry_SYSCALL_64_after_hwframe+0x77/0x7f When device_create_file() returns an error in atm_register_sysfs(), the same issue can also be triggered. Function put_device() should be called to release the kobj->name memory and other device resources, instead of kfree(). Fixes: 1fa5ae857bb1 ("driver core: get rid of struct device's bus_id string array") Signed-off-by: Wang Liang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250901063537.1472221-1-wangliang74@huawei.com Signed-off-by: Paolo Abeni --- net/atm/resources.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/atm/resources.c b/net/atm/resources.c index b19d851e1f44..7c6fdedbcf4e 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -112,7 +112,9 @@ struct atm_dev *atm_dev_register(const char *type, struct device *parent, if (atm_proc_dev_register(dev) < 0) { pr_err("atm_proc_dev_register failed for dev %s\n", type); - goto out_fail; + mutex_unlock(&atm_dev_mutex); + kfree(dev); + return NULL; } if (atm_register_sysfs(dev, parent) < 0) { @@ -128,7 +130,7 @@ out: return dev; out_fail: - kfree(dev); + put_device(&dev->class_dev); dev = NULL; goto out; } -- cgit v1.2.3 From 61481d72e153703df180c46c3d0eb648fe0416b1 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 1 Sep 2025 19:48:57 +0800 Subject: ipv6: sit: Add ipip6_tunnel_dst_find() for cleanup Extract the dst lookup logic from ipip6_tunnel_xmit() into a new helper, ipip6_tunnel_dst_find(), to reduce code duplication and enhance readability. No functional change intended. 
On a x86_64, with allmodconfig object size is also reduced: ./scripts/bloat-o-meter net/ipv6/sit.o net/ipv6/sit-new.o add/remove: 5/3 grow/shrink: 3/4 up/down: 1841/-2275 (-434) Function old new delta ipip6_tunnel_dst_find - 1697 +1697 __pfx_ipip6_tunnel_dst_find - 64 +64 __UNIQUE_ID_modinfo2094 - 43 +43 ipip6_tunnel_xmit.isra.cold 79 88 +9 __UNIQUE_ID_modinfo2096 12 20 +8 __UNIQUE_ID___addressable_init_module2092 - 8 +8 __UNIQUE_ID___addressable_cleanup_module2093 - 8 +8 __func__ 55 59 +4 __UNIQUE_ID_modinfo2097 20 18 -2 __UNIQUE_ID___addressable_init_module2093 8 - -8 __UNIQUE_ID___addressable_cleanup_module2094 8 - -8 __UNIQUE_ID_modinfo2098 18 - -18 __UNIQUE_ID_modinfo2095 43 12 -31 descriptor 112 56 -56 ipip6_tunnel_xmit.isra 9910 7758 -2152 Total: Before=72537, After=72103, chg -0.60% Signed-off-by: Yue Haibing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20250901114857.1968513-1-yuehaibing@huawei.com Signed-off-by: Paolo Abeni --- net/ipv6/sit.c | 104 ++++++++++++++++++++++++++------------------------------- 1 file changed, 48 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 12496ba1b7d4..cf37ad9686e6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -848,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel, return dst; } +static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst, + bool is_isatap) +{ + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct neighbour *neigh = NULL; + const struct in6_addr *addr6; + bool found = false; + int addr_type; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (!neigh) { + net_dbg_ratelimited("nexthop == NULL\n"); + return false; + } + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (is_isatap) { + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } else { + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } + + neigh_release(neigh); + + return found; +} + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
@@ -867,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; - const struct in6_addr *addr6; - int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); @@ -877,64 +918,15 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ - if (dev->priv_flags & IFF_ISATAP) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if ((addr_type & IPV6_ADDR_UNICAST) && - ipv6_addr_is_isatap(addr6)) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if ((dev->priv_flags & IFF_ISATAP) && + !ipip6_tunnel_dst_find(skb, &dst, true)) + goto tx_error; if (!dst) dst = try_6rd(tunnel, &iph6->daddr); - if (!dst) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) != 0) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false)) + goto tx_error; flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, -- cgit v1.2.3 From 3ceb08838b576b20108d7facf6baa3dbf792afe9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Sep 2025 14:12:10 -0700 Subject: net: add helper to pre-check if PP for an Rx queue will be unreadable mlx5 pokes into the rxq state to check if the queue has a memory provider, and therefore whether it may produce unreadable mem. Add a helper for doing this in the page pool API. fbnic will want a similar thing (tho, for a slightly different reason). 
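For a rough picture of the intended call site, a driver would gate its header-pool setup on the new helper roughly as below. This is only a sketch: the foo_* names, struct fields and sizing are made up, while netif_rxq_has_unreadable_mp(), page_pool_create() and PTR_ERR_OR_ZERO() are real API; the actual mlx5 conversion follows in the diff.

/* Hypothetical driver helper: allocate a separate CPU-readable pool
 * for packet headers only when the queue's main pool may hand out
 * unreadable (device-only) memory because user space bound a memory
 * provider to the queue.
 */
static int foo_alloc_hdr_pool(struct foo_rq *rq)
{
	struct page_pool_params pp_params = { };

	if (!netif_rxq_has_unreadable_mp(rq->netdev, rq->ix))
		return 0;	/* main pool is CPU-readable, nothing to do */

	pp_params.order = 0;
	pp_params.pool_size = 256;	/* illustrative sizing */
	pp_params.netdev = rq->netdev;
	pp_params.dma_dir = DMA_FROM_DEVICE;

	rq->hdr_pool = page_pool_create(&pp_params);
	return PTR_ERR_OR_ZERO(rq->hdr_pool);
}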
Reviewed-by: Mina Almasry Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250901211214.1027927-11-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 9 +-------- include/net/netdev_queues.h | 2 ++ include/net/page_pool/helpers.h | 12 ++++++++++++ net/core/netdev_rx_queue.c | 9 +++++++++ 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3970d0ddbcdc..714cce595692 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -780,13 +780,6 @@ static void mlx5e_rq_shampo_hd_info_free(struct mlx5e_rq *rq) bitmap_free(rq->mpwqe.shampo->bitmap); } -static bool mlx5_rq_needs_separate_hd_pool(struct mlx5e_rq *rq) -{ - struct netdev_rx_queue *rxq = __netif_get_rx_queue(rq->netdev, rq->ix); - - return !!rxq->mp_params.mp_ops; -} - static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_rq_param *rqp, @@ -825,7 +818,7 @@ static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev, hd_pool_size = (rq->mpwqe.shampo->hd_per_wqe * wq_size) / MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; - if (mlx5_rq_needs_separate_hd_pool(rq)) { + if (netif_rxq_has_unreadable_mp(rq->netdev, rq->ix)) { /* Separate page pool for shampo headers */ struct page_pool_params pp_params = { }; diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b9d02bc65c97..cd00e0406cf4 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -151,6 +151,8 @@ struct netdev_queue_mgmt_ops { int idx); }; +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); + /** * DOC: Lockless queue stopping / waking helpers. * diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index aa3719f28216..3247026e096a 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -505,6 +505,18 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +/** + * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU + * @pool: queried page pool + * + * Check if page pool will return buffers which are unreadable to the CPU / + * kernel. This will only be the case if user space bound a memory provider (mp) + * which returns unreadable memory to the queue served by the page pool. + * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound + * this helper will return false. See also netif_rxq_has_unreadable_mp(). 
+ * + * Return: true if memory allocated by the page pool may be unreadable */ static inline bool page_pool_is_unreadable(struct page_pool *pool) { return !!pool->mp_ops; diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3bf1151d8061..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,6 +9,15 @@ #include "page_pool_priv.h" +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); -- cgit v1.2.3 From ee63609454838ea2b108f96f74a287be72d281ee Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:19 +1000 Subject: wifi: mac80211: support block bitmap S1G TIM encoding An S1G TIM PVB is encoded differently compared to a non-S1G TIM PVB. As the AP dictates which encoding mode it uses, here we only implement block bitmap encoding. This is the default encoding mode used by all current vendor implementations. Additionally, S1G has a maximum AID count of 8192; however, we are limiting the current implementation to 1600. This has no resemblance to the standard and is purely an implementation detail. The reason for this is the TIM element's maximum length of 255. This allows for, at most, 25 encoded blocks for a PVB encoded with block bitmap. Support for the maximum of 8192 AIDs will require an implementation of page slicing to be added to mac80211. As a result, we perform extra validation on both the STA and AP side when receiving an AID as an S1G interface. Add support for block bitmap encoding for an S1G AP and limit the maximum AID count to 1600 for the current mac80211 implementation. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +- net/mac80211/cfg.c | 10 ++- net/mac80211/ieee80211_i.h | 8 +++ net/mac80211/mlme.c | 19 ++--- net/mac80211/tx.c | 166 +++++++++++++++++++++++++++++++++---------- 5 files changed, 156 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1a14f2892d9..a4bc0c2729f6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2283,7 +2283,8 @@ enum nl80211_commands { * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16). * This is similar to @NL80211_ATTR_STA_AID but with a difference of being * allowed to be used with the first @NL80211_CMD_SET_STATION command to - * update a TDLS peer STA entry. + * update a TDLS peer STA entry. For S1G interfaces, this is limited to + * 1600 for the current mac80211 implementation. * * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2ed07fa121ab..4603350989c8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2171,10 +2171,16 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* * cfg80211 validates this (1-2007) and allows setting the AID - * only when creating a new station entry + * only when creating a new station entry. For S1G APs, the current + * implementation supports a maximum of 1600 AIDs. 
*/ - if (params->aid) + if (params->aid) { + if (sdata->vif.cfg.s1g && + params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID) + return -EINVAL; + sta->sta.aid = params->aid; + } /* * Some of the following updates would be racy if called on an diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8afa2404eaa8..07f5fb11569b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_MAX_NAN_INSTANCE_ID 255 +/* + * Current mac80211 implementation supports a maximum of 1600 AIDS + * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks + * as each block is 64 AIDs. + */ +#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 +#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 + enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 353e89973d1e..f5d09ded9827 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6351,6 +6351,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; + u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -6377,10 +6378,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); - if (assoc_data->s1g) + if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; - else + max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; + } else { elem_start = mgmt->u.assoc_resp.variable; + } /* * Note: this may not be perfect, AP might misbehave - if @@ -6404,16 +6407,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); - else if (assoc_data->s1g) - aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* - * The 5 MSB of the AID field are reserved - * (802.11-2016 9.4.1.8 AID field) + * The 5 MSB of the AID field are reserved for a non-S1G STA. For + * an S1G STA the 3 MSBs are reserved. + * (802.11-2016 9.4.1.8 AID field). */ - aid &= 0x7ff; + aid &= assoc_data->s1g ? 
0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", @@ -6450,7 +6452,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (aid == 0 || aid > IEEE80211_MAX_AID) { + if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); @@ -6488,6 +6490,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } sdata->vif.cfg.aid = aid; + sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 00671ae45b2f..0ece8d89e094 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4882,15 +4882,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t) /* functions for drivers to get certain frames */ +static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int i, n1 = 0, n2; + + /* + * Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 8 through 2007 are 0. + */ + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (ps->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (ps->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + skb_put_u8(skb, n1 | mcast_traffic); + /* Part Virt Bitmap */ + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); +} + +/* + * mac80211 currently supports encoding using block bitmap mode, non + * inversed. The current implementation supports up to 1600 AIDs. + * + * Block bitmap encoding breaks down the AID bitmap into blocks of 64 + * AIDs. Each block contains between 0 and 8 subblocks. Each subblock + * describes 8 AIDs and the presence of a subblock is determined by + * the block bitmap. + */ +static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int blk; + + /* + * Emit a bitmap control block with a page slice number of 31 and a + * page index of 0 which indicates as per IEEE80211-2024 9.4.2.5.1 + * that the entire page (2048 bits) indicated by the page index + * is encoded in the partial virtual bitmap. + */ + skb_put_u8(skb, mcast_traffic | (31 << 1)); + + /* Emit an encoded block for each non-zero sub-block */ + for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { + u8 blk_bmap = 0; + int sblk; + + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + /* + * If the current subblock is non-zero, increase the + * number of subblocks to emit for the current block. + */ + if (ps->tim[sblk_idx]) + blk_bmap |= BIT(sblk); + } + + /* If the current block contains no non-zero sublocks */ + if (!blk_bmap) + continue; + + /* + * Emit a block control byte for the current encoded block + * with an encoding mode of block bitmap (0x0), not inverse + * (0x0) and the current block offset (5 bits) + */ + skb_put_u8(skb, blk << 3); + + /* + * Emit the block bitmap for the current encoded block which + * contains the present subblocks. 
+ */ + skb_put_u8(skb, blk_bmap); + + /* Emit the present subblocks */ + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + if (!(blk_bmap & BIT(sblk))) + continue; + + skb_put_u8(skb, ps->tim[sblk_idx]); + } + } +} + static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { - u8 *pos, *tim; - int aid0 = 0; - int i, have_bits = 0, n1, n2; + struct element *tim; + bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; + bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ @@ -4898,7 +4997,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, - IEEE80211_MAX_AID+1); + IEEE80211_MAX_AID + 1); + if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; @@ -4906,51 +5006,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 5); - *pos++ = WLAN_EID_TIM; - *pos++ = 3; - *pos++ = ps->dtim_count; - *pos++ = link_conf->dtim_period; + /* Length is set after parsing the AID bitmap */ + tim = skb_put(skb, sizeof(struct element)); + tim->id = WLAN_EID_TIM; + skb_put_u8(skb, ps->dtim_count); + skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) - aid0 = 1; + mcast_traffic = true; - ps->dtim_bc_mc = aid0 == 1; + ps->dtim_bc_mc = mcast_traffic; if (have_bits) { - /* Find largest even number N1 so that bits numbered 1 through - * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits - * (N2 + 1) x 8 through 2007 are 0. */ - n1 = 0; - for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { - if (ps->tim[i]) { - n1 = i & 0xfe; - break; - } - } - n2 = n1; - for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { - if (ps->tim[i]) { - n2 = i; - break; - } - } - - /* Bitmap control */ - *pos++ = n1 | aid0; - /* Part Virt Bitmap */ - skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); - - tim[1] = n2 - n1 + 4; + if (s1g) + ieee80211_s1g_beacon_add_tim_pvb(ps, skb, + mcast_traffic); + else + ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { - *pos++ = aid0; /* Bitmap control */ - - if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { - tim[1] = 4; + /* + * If there is no buffered unicast traffic for an S1G + * interface, we can exclude the bitmap control. This is in + * contrast to other phy types as they do include the bitmap + * control and pvb even when there is no buffered traffic. + */ + if (!s1g) { + /* Bitmap control */ + skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } + + tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From e0c47c6229c25b54440fe1f84a0ff533942290b1 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:20 +1000 Subject: wifi: mac80211: support parsing S1G TIM PVB An S1G TIM PVB has 3 mandatory encoding modes, those being block bitmap, single AID and OLB, alongside the ability for each encoding mode to be inverted. Introduce the ability to parse the 3 encoding formats. The implementation specification for the encoding formats can be found in IEEE80211-2024 9.4.2.5. 
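As an aid to reading the parser added below, this is how a target AID and a block control octet decompose under the scheme described above. The bit manipulations are taken from the helpers in the diff; the function wrapper itself is only illustrative:

/* An AID selects a block (64 AIDs), a subblock (8 AIDs) within that
 * block, and a bit within that subblock. The block control octet that
 * precedes each encoded block carries the encoding mode, the inverse
 * flag and the block offset.
 */
static void s1g_tim_addressing_example(u16 aid, u8 ctrl)
{
	u8 blk = (aid >> 6) & 0x1f;	/* target block */
	u8 subblk = (aid >> 3) & 0x7;	/* target subblock */
	u8 subblk_bit = aid & 0x7;	/* bit within the subblock */

	u8 enc_mode = ctrl & 0x03;	/* BLOCK, SINGLE or OLB */
	bool inverse = ctrl & BIT(2);
	u8 blk_off = ctrl >> 3;		/* block offset, 5 bits */

	pr_info("aid %u -> blk %u sub %u bit %u; ctrl: mode %u inv %d off %u\n",
		aid, blk, subblk, subblk_bit, enc_mode, inverse, blk_off);
}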
Signed-off-by: Arien Judge Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/carl9170/rx.c | 2 +- drivers/net/wireless/intersil/p54/txrx.c | 2 +- drivers/net/wireless/ralink/rt2x00/rt2x00dev.c | 2 +- drivers/net/wireless/realtek/rtlwifi/ps.c | 2 +- include/linux/ieee80211.h | 265 ++++++++++++++++++++++++- net/mac80211/mesh_ps.c | 2 +- net/mac80211/mlme.c | 3 +- 7 files changed, 263 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/carl9170/rx.c b/drivers/net/wireless/ath/carl9170/rx.c index 908c4c8b7f82..6833430130f4 100644 --- a/drivers/net/wireless/ath/carl9170/rx.c +++ b/drivers/net/wireless/ath/carl9170/rx.c @@ -555,7 +555,7 @@ static void carl9170_ps_beacon(struct ar9170 *ar, void *data, unsigned int len) /* Check whenever the PHY can be turned off again. */ /* 1. What about buffered unicast traffic for our AID? */ - cam = ieee80211_check_tim(tim_ie, tim_len, ar->common.curaid); + cam = ieee80211_check_tim(tim_ie, tim_len, ar->common.curaid, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ cam |= !!(tim_ie->bitmap_ctrl & 0x01); diff --git a/drivers/net/wireless/intersil/p54/txrx.c b/drivers/net/wireless/intersil/p54/txrx.c index 2deb1bb54f24..1294a1d6528e 100644 --- a/drivers/net/wireless/intersil/p54/txrx.c +++ b/drivers/net/wireless/intersil/p54/txrx.c @@ -317,7 +317,7 @@ static void p54_pspoll_workaround(struct p54_common *priv, struct sk_buff *skb) tim_len = tim[1]; tim_ie = (struct ieee80211_tim_ie *) &tim[2]; - new_psm = ieee80211_check_tim(tim_ie, tim_len, priv->aid); + new_psm = ieee80211_check_tim(tim_ie, tim_len, priv->aid, false); if (new_psm != priv->powersave_override) { priv->powersave_override = new_psm; p54_set_ps(priv); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 7db29e90eb4f..f8a6f9c968a1 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -679,7 +679,7 @@ static void rt2x00lib_rxdone_check_ps(struct rt2x00_dev *rt2x00dev, /* Check whenever the PHY can be turned off again. */ /* 1. What about buffered unicast traffic for our AID? */ - cam = ieee80211_check_tim(tim_ie, tim_len, rt2x00dev->aid); + cam = ieee80211_check_tim(tim_ie, tim_len, rt2x00dev->aid, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ cam |= (tim_ie->bitmap_ctrl & 0x01); diff --git a/drivers/net/wireless/realtek/rtlwifi/ps.c b/drivers/net/wireless/realtek/rtlwifi/ps.c index 6241e4fed4f6..bcab12c3b4c1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/ps.c +++ b/drivers/net/wireless/realtek/rtlwifi/ps.c @@ -519,7 +519,7 @@ void rtl_swlps_beacon(struct ieee80211_hw *hw, void *data, unsigned int len) /* 1. What about buffered unicast traffic for our AID? */ u_buffed = ieee80211_check_tim(tim_ie, tim_len, - rtlpriv->mac80211.assoc_id); + rtlpriv->mac80211.assoc_id, false); /* 2. Maybe the AP wants to send multicast/broadcast data? 
*/ m_buffed = tim_ie->bitmap_ctrl & 0x01; diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e5a2096e022e..d350263f23f3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -220,6 +220,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_AID_S1G 8191 #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 + +/* S1G encoding types */ +#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 0 +#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 +#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 + /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section 6.2.1.1.2. @@ -4757,15 +4763,8 @@ static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) return 1024 * tu; } -/** - * ieee80211_check_tim - check if AID bit is set in TIM - * @tim: the TIM IE - * @tim_len: length of the TIM IE - * @aid: the AID to look for - * Return: whether or not traffic is indicated in the TIM for the given AID - */ -static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid) +static inline bool __ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) { u8 mask; u8 index, indexn1, indexn2; @@ -4788,6 +4787,254 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, return !!(tim->virtual_map[index] & mask); } +struct s1g_tim_aid { + u16 aid; + u8 target_blk; /* Target block index */ + u8 target_subblk; /* Target subblock index */ + u8 target_subblk_bit; /* Target subblock bit */ +}; + +struct s1g_tim_enc_block { + u8 enc_mode; + bool inverse; + const u8 *ptr; + u8 len; + + /* + * For an OLB encoded block that spans multiple blocks, this + * is the offset into the span described by that encoded block. + */ + u8 olb_blk_offset; +}; + +/* + * Helper routines to quickly extract the length of an encoded block. Validation + * is also performed to ensure the length extracted lies within the TIM. + */ + +static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) +{ + u8 blkmap; + u8 n_subblks; + + if (ptr >= end) + return -EINVAL; + + blkmap = *ptr; + n_subblks = hweight8(blkmap); + + if (ptr + 1 + n_subblks > end) + return -EINVAL; + + return 1 + n_subblks; +} + +static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) +{ + return (ptr + 1 > end) ? -EINVAL : 1; +} + +static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) +{ + if (ptr >= end) + return -EINVAL; + + return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; +} + +/* + * Enumerate all encoded blocks until we find the encoded block that describes + * our target AID. OLB is a special case as a single encoded block can describe + * multiple blocks as a single encoded block. 
+ */ +static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc, + const struct s1g_tim_aid *aid, + const u8 *ptr, const u8 *end) +{ + /* need at least block-control octet */ + while (ptr + 1 <= end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool contains, inverse = ctrl & BIT(2); + u8 span, blk_off = ctrl >> 3; + int len; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + len = ieee80211_s1g_len_bitmap(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + len = ieee80211_s1g_len_single(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + len = ieee80211_s1g_len_olb(ptr, end); + /* + * An OLB encoded block can describe more then one + * block, meaning an encoded OLB block can span more + * then a single block. + */ + if (len > 0) { + /* Minus one for the length octet */ + span = DIV_ROUND_UP(len - 1, 8); + /* + * Check if our target block lies within the + * block span described by this encoded block. + */ + contains = (aid->target_blk >= blk_off) && + (aid->target_blk < blk_off + span); + } + break; + default: + return -EOPNOTSUPP; + } + + if (len < 0) + return len; + + if (contains) { + enc->enc_mode = mode; + enc->inverse = inverse; + enc->ptr = ptr; + enc->len = (u8)len; + enc->olb_blk_offset = blk_off; + return 0; + } + + ptr += len; + } + + return -ENOENT; +} + +static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blkmap = *ptr++; + + /* + * If our block bitmap does not contain a set bit that corresponds + * to our AID, it could mean a variety of things depending on if + * the encoding mode is inverted or not. + * + * 1. If inverted, it means the entire subblock is present and hence + * our AID has been set. + * 2. If not inverted, it means our subblock is not present and hence + * it is all zero meaning our AID is not set. + */ + if (!(blkmap & BIT(aid->target_subblk))) + return enc->inverse; + + /* + * Increment ptr by the number of set subblocks that appear before our + * target subblock. If our target subblock is 0, do nothing as ptr + * already points to our target subblock. + */ + if (aid->target_subblk) + ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0)); + + return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + /* + * Single AID mode describes, as the name suggests, a single AID + * within the block described by the encoded block. The octet + * contains the 6 LSBs of the AID described in the block. The other + * 2 bits are reserved. When inversed, every single AID described + * by the current block have buffered traffic except for the AID + * described in the single AID octet. + */ + return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blk_len = *ptr++; + /* + * Given an OLB encoded block that describes multiple blocks, + * calculate the offset into the span. Then calculate the + * subblock location normally. 
+ */ + u16 span_offset = aid->target_blk - enc->olb_blk_offset; + u16 subblk_idx = span_offset * 8 + aid->target_subblk; + + if (subblk_idx >= blk_len) + return enc->inverse; + + return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +/* + * An S1G PVB has 3 non optional encoding types, each that can be inverted. + * An S1G PVB is constructed with zero or more encoded block subfields. Each + * encoded block represents a single "block" of AIDs (64), and each encoded + * block can contain one of the 3 encoding types alongside a single bit for + * whether the bits should be inverted. + * + * As the standard makes no guarantee about the ordering of encoded blocks, + * we must parse every encoded block in the worst case scenario given an + * AID that lies within the last block. + */ +static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) +{ + int err; + struct s1g_tim_aid target_aid; + struct s1g_tim_enc_block enc_blk; + + if (tim_len < 3) + return false; + + target_aid.aid = aid; + target_aid.target_blk = (aid >> 6) & 0x1f; + target_aid.target_subblk = (aid >> 3) & 0x7; + target_aid.target_subblk_bit = aid & 0x7; + + /* + * Find our AIDs target encoded block and fill &enc_blk with the + * encoded blocks information. If no entry is found or an error + * occurs return false. + */ + err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, + tim->virtual_map, + (const u8 *)tim + tim_len + 2); + if (err) + return false; + + switch (enc_blk.enc_mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + return ieee80211_s1g_parse_single(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); + default: + return false; + } +} + +/** + * ieee80211_check_tim - check if AID bit is set in TIM + * @tim: the TIM IE + * @tim_len: length of the TIM IE + * @aid: the AID to look for + * @s1g: whether the TIM is from an S1G PPDU + * Return: whether or not traffic is indicated in the TIM for the given AID + */ +static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid, bool s1g) +{ + return s1g ? 
ieee80211_s1g_check_tim(tim, tim_len, aid) : + __ieee80211_check_tim(tim, tim_len, aid); +} + /** * ieee80211_get_tdls_action - get TDLS action code * @skb: the skb containing the frame, length will not be checked diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 20e022a03933..ebab1f0a0138 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -586,7 +586,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->mesh->aid); + sta->mesh->aid, false); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f5d09ded9827..9568cc95a7ff 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7438,7 +7438,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) { + ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, + vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; -- cgit v1.2.3 From 1860b1a8257c5d52fbed3093eb9a3e7476619403 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:21 +1000 Subject: wifi: mac80211: kunit: add kunit tests for S1G PVB decoding Add support for testing the 6 examples mentioned in IEEE80211-2024 Annex L. These tests cover the 3 mandatory decoding modes being block bitmap, single AID and OLB alongside their equivalent inverses. Test output: 1..6 s1g_tim_block_test: Block 0 (ENC=BLOCK, blk_off=0, inverse=0) s1g_tim_block_test: octet 1 (ctrl) : 00000000 (0x00) s1g_tim_block_test: octet 2 (blk-map) : 00000101 (0x05) s1g_tim_block_test: octet 3 (SB 0) : 01000010 (0x42) s1g_tim_block_test: octet 4 (SB 2) : 10100000 (0xa0) ok 1 s1g_tim_block_test s1g_tim_single_test: Block 0 (ENC=SINGLE, blk_off=0, inverse=0) s1g_tim_single_test: octet 1 (ctrl) : 00000001 (0x01) s1g_tim_single_test: octet 2 (single) : 00011111 (0x1f) ok 2 s1g_tim_single_test s1g_tim_olb_test: Block 0 (ENC=OLB, blk_off=0, inverse=0) s1g_tim_olb_test: octet 1 (ctrl) : 00000010 (0x02) s1g_tim_olb_test: octet 2 (len= 9) : 00001001 (0x09) s1g_tim_olb_test: octet 3 (SB 0) : 01000010 (0x42) s1g_tim_olb_test: octet 4 (SB 1) : 10100000 (0xa0) s1g_tim_olb_test: octet 5 (SB 2) : 01000010 (0x42) s1g_tim_olb_test: octet 6 (SB 3) : 10100000 (0xa0) s1g_tim_olb_test: octet 7 (SB 4) : 01000010 (0x42) s1g_tim_olb_test: octet 8 (SB 5) : 10100000 (0xa0) s1g_tim_olb_test: octet 9 (SB 6) : 01000010 (0x42) s1g_tim_olb_test: octet 10 (SB 7) : 10100000 (0xa0) s1g_tim_olb_test: octet 11 (SB 8) : 01000010 (0x42) ok 3 s1g_tim_olb_test s1g_tim_inverse_block_test: Block 0 (ENC=BLOCK, blk_off=0, inverse=1) s1g_tim_inverse_block_test: octet 1 (ctrl) : 00000100 (0x04) s1g_tim_inverse_block_test: octet 2 (blk-map) : 00000101 (0x05) s1g_tim_inverse_block_test: octet 3 (SB 0) : 01000010 (0x42) s1g_tim_inverse_block_test: octet 4 (SB 2) : 10100000 (0xa0) ok 4 s1g_tim_inverse_block_test s1g_tim_inverse_single_test: Block 0 (ENC=SINGLE, blk_off=0, inverse=1) s1g_tim_inverse_single_test: octet 1 (ctrl) : 00000101 (0x05) s1g_tim_inverse_single_test: octet 2 (single) : 00011111 (0x1f) ok 5 s1g_tim_inverse_single_test s1g_tim_inverse_olb_test: Block 0 (ENC=OLB, blk_off=0, inverse=1) s1g_tim_inverse_olb_test: octet 1 (ctrl) : 
00000110 (0x06) s1g_tim_inverse_olb_test: octet 2 (len= 9) : 00001001 (0x09) s1g_tim_inverse_olb_test: octet 3 (SB 0) : 01000010 (0x42) s1g_tim_inverse_olb_test: octet 4 (SB 1) : 10100000 (0xa0) s1g_tim_inverse_olb_test: octet 5 (SB 2) : 01000010 (0x42) s1g_tim_inverse_olb_test: octet 6 (SB 3) : 10100000 (0xa0) s1g_tim_inverse_olb_test: octet 7 (SB 4) : 01000010 (0x42) s1g_tim_inverse_olb_test: octet 8 (SB 5) : 10100000 (0xa0) s1g_tim_inverse_olb_test: octet 9 (SB 6) : 01000010 (0x42) s1g_tim_inverse_olb_test: octet 10 (SB 7) : 10100000 (0xa0) s1g_tim_inverse_olb_test: octet 11 (SB 8) : 01000010 (0x42) ok 6 s1g_tim_inverse_olb_test mac80211-s1g-tim: pass:6 fail:0 skip:0 total:6 Totals: pass:6 fail:0 skip:0 total:6 Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-4-lachlan.hodges@morsemicro.com [make tim_push() void, non-inline] Signed-off-by: Johannes Berg --- net/mac80211/tests/Makefile | 2 +- net/mac80211/tests/s1g_tim.c | 356 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 357 insertions(+), 1 deletion(-) create mode 100644 net/mac80211/tests/s1g_tim.c (limited to 'net') diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile index 3b0c08356fc5..3c7f874e5c41 100644 --- a/net/mac80211/tests/Makefile +++ b/net/mac80211/tests/Makefile @@ -1,3 +1,3 @@ -mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o +mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o s1g_tim.o obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o diff --git a/net/mac80211/tests/s1g_tim.c b/net/mac80211/tests/s1g_tim.c new file mode 100644 index 000000000000..642fa4ece89f --- /dev/null +++ b/net/mac80211/tests/s1g_tim.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for S1G TIM PVB decoding. This test suite covers + * IEEE80211-2024 Annex L figures 8, 9, 10, 12, 13, 14. ADE mode + * is not covered as it is an optional encoding format and is not + * currently supported by mac80211. + * + * Copyright (C) 2025 Morse Micro + */ +#include +#include +#include + +#define MAX_AID 128 + +#define BC(enc_mode, inverse, blk_off) \ + ((((blk_off) & 0x1f) << 3) | ((inverse) ? BIT(2) : 0) | \ + ((enc_mode) & 0x3)) + +static void byte_to_bitstr(u8 v, char *out) +{ + for (int b = 7; b >= 0; b--) + *out++ = (v & BIT(b)) ? '1' : '0'; + *out = '\0'; +} + +static void dump_tim_bits(struct kunit *test, + const struct ieee80211_tim_ie *tim, u8 tim_len) +{ + const u8 *ptr = tim->virtual_map; + const u8 *end = (const u8 *)tim + tim_len; + unsigned int oct = 1; + unsigned int blk = 0; + char bits[9]; + + while (ptr < end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool inverse = ctrl & BIT(2); + u8 blk_off = ctrl >> 3; + + kunit_info( + test, "Block %u (ENC=%s, blk_off=%u, inverse=%u)", blk, + (mode == IEEE80211_S1G_TIM_ENC_MODE_BLOCK) ? "BLOCK" : + (mode == IEEE80211_S1G_TIM_ENC_MODE_SINGLE) ? 
"SINGLE" : + "OLB", + blk_off, inverse); + + byte_to_bitstr(ctrl, bits); + kunit_info(test, " octet %2u (ctrl) : %s (0x%02x)", oct, + bits, ctrl); + ++oct; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: { + u8 blkmap = *ptr++; + + byte_to_bitstr(blkmap, bits); + kunit_info(test, " octet %2u (blk-map) : %s (0x%02x)", + oct, bits, blkmap); + ++oct; + + for (u8 sb = 0; sb < 8; sb++) { + if (!(blkmap & BIT(sb))) + continue; + u8 sub = *ptr++; + + byte_to_bitstr(sub, bits); + kunit_info( + test, + " octet %2u (SB %2u) : %s (0x%02x)", + oct, sb, bits, sub); + ++oct; + } + break; + } + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: { + u8 single = *ptr++; + + byte_to_bitstr(single, bits); + kunit_info(test, " octet %2u (single) : %s (0x%02x)", + oct, bits, single); + ++oct; + break; + } + case IEEE80211_S1G_TIM_ENC_MODE_OLB: { + u8 len = *ptr++; + + byte_to_bitstr(len, bits); + kunit_info(test, " octet %2u (len=%2u) : %s (0x%02x)", + oct, len, bits, len); + ++oct; + + for (u8 i = 0; i < len && ptr < end; i++) { + u8 sub = *ptr++; + + byte_to_bitstr(sub, bits); + kunit_info( + test, + " octet %2u (SB %2u) : %s (0x%02x)", + oct, i, bits, sub); + ++oct; + } + break; + } + default: + kunit_info(test, " ** unknown encoding 0x%x **", mode); + return; + } + blk++; + } +} + +static void tim_push(u8 **p, u8 v) +{ + *(*p)++ = v; +} + +static void tim_begin(struct ieee80211_tim_ie *tim, u8 **p) +{ + tim->dtim_count = 0; + tim->dtim_period = 1; + tim->bitmap_ctrl = 0; + *p = tim->virtual_map; +} + +static u8 tim_end(struct ieee80211_tim_ie *tim, u8 *tail) +{ + return tail - (u8 *)tim; +} + +static void pvb_add_block_bitmap(u8 **p, u8 blk_off, bool inverse, u8 blk_bmap, + const u8 *subblocks) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_BLOCK; + u8 n = hweight8(blk_bmap); + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, blk_bmap); + + for (u8 i = 0; i < n; i++) + tim_push(p, subblocks[i]); +} + +static void pvb_add_single_aid(u8 **p, u8 blk_off, bool inverse, u8 single6) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_SINGLE; + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, single6 & GENMASK(5, 0)); +} + +static void pvb_add_olb(u8 **p, u8 blk_off, bool inverse, const u8 *subblocks, + u8 len) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_OLB; + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, len); + for (u8 i = 0; i < len; i++) + tim_push(p, subblocks[i]); +} + +static void check_all_aids(struct kunit *test, + const struct ieee80211_tim_ie *tim, u8 tim_len, + const unsigned long *expected) +{ + for (u16 aid = 1; aid <= MAX_AID; aid++) { + bool want = test_bit(aid, expected); + bool got = ieee80211_s1g_check_tim(tim, tim_len, aid); + + KUNIT_ASSERT_EQ_MSG(test, got, want, + "AID %u mismatch (got=%d want=%d)", aid, + got, want); + } +} + +static void fill_bitmap(unsigned long *bm, const u16 *list, size_t n) +{ + size_t i; + + bitmap_zero(bm, MAX_AID + 1); + for (i = 0; i < n; i++) + __set_bit(list[i], bm); +} + +static void fill_bitmap_inverse(unsigned long *bm, u16 max_aid, + const u16 *except, size_t n_except) +{ + bitmap_zero(bm, MAX_AID + 1); + for (u16 aid = 1; aid <= max_aid; aid++) + __set_bit(aid, bm); + + for (size_t i = 0; i < n_except; i++) + if (except[i] <= max_aid) + __clear_bit(except[i], bm); +} + +static void s1g_tim_block_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + static const u8 subblocks[] = { + 0x42, /* SB m=0: AIDs 1,6 */ + 0xA0, /* SB m=2: AIDs 21,23 */ + }; + u8 blk_bmap = 0x05; /* bits 0 and 2 set */ 
+ bool inverse = false; + static const u16 set_list[] = { 1, 6, 21, 23 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_single_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = false; + u8 blk_off = 0; + u8 single6 = 0x1f; /* 31 */ + static const u16 set_list[] = { 31 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_single_aid(&p, blk_off, inverse, single6); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_olb_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = false; + u8 blk_off = 0; + static const u16 set_list[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33, + 38, 45, 47, 49, 54, 61, 63, 65, 70 }; + static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42, + 0xA0, 0x42, 0xA0, 0x42 }; + u8 len = ARRAY_SIZE(subblocks); + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_olb(&p, blk_off, inverse, subblocks, len); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_block_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + /* Same sub-block content as Figure L-8, but inverse = true */ + static const u8 subblocks[] = { + 0x42, /* SB m=0: AIDs 1,6 */ + 0xA0, /* SB m=2: AIDs 21,23 */ + }; + u8 blk_bmap = 0x05; + bool inverse = true; + /* All AIDs except 1,6,21,23 are set */ + static const u16 except[] = { 1, 6, 21, 23 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_single_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = true; + u8 blk_off = 0; + u8 single6 = 0x1f; /* 31 */ + /* All AIDs except 31 are set */ + static const u16 except[] = { 31 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_single_aid(&p, blk_off, inverse, single6); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_olb_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = true; + u8 blk_off = 0, len; + /* All AIDs except the list below are set */ + static const u16 except[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33, + 38, 45, 47, 49, 54, 61, 63, 65, 70 }; + static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42, + 0xA0, 0x42, 0xA0, 0x42 }; + len = ARRAY_SIZE(subblocks); + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_olb(&p, blk_off, inverse, subblocks, len); + tim_len = tim_end(tim, p); + + 
fill_bitmap_inverse(exp, 127, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static struct kunit_case s1g_tim_test_cases[] = { + KUNIT_CASE(s1g_tim_block_test), + KUNIT_CASE(s1g_tim_single_test), + KUNIT_CASE(s1g_tim_olb_test), + KUNIT_CASE(s1g_tim_inverse_block_test), + KUNIT_CASE(s1g_tim_inverse_single_test), + KUNIT_CASE(s1g_tim_inverse_olb_test), + {} +}; + +static struct kunit_suite s1g_tim = { + .name = "mac80211-s1g-tim", + .test_cases = s1g_tim_test_cases, +}; + +kunit_test_suite(s1g_tim); -- cgit v1.2.3 From 5f9d5fd8e08968e66d0212f782fc24d76e52800f Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 12 Aug 2025 12:53:28 +0530 Subject: wifi: cfg80211: fix return value in cfg80211_get_radio_idx_by_chan() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a valid radio index is not found, the function returns -ENOENT. If the channel argument itself is invalid, it returns -EINVAL. However, since the caller only checks for < 0, the distinction between these error codes is not utilized much. Also, handling these two distinct error codes throughout the codebase adds complexity, as both cases must be addressed separately. A subsequent change aims to simplify this by using a single error code for all invalid cases, making error handling more consistent and streamlined. To support this change, update the return value to -EINVAL when a valid radio index is not found. This is still appropriate because, even if the channel argument is structurally valid, the absence of a corresponding radio index implies that the argument is effectively invalid; otherwise, a valid index would have been found. Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250812-fix_scan_ap_flag_requirement_during_mlo-v4-1-383ffb6da213@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- net/wireless/util.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 406626ff6cc8..cb1c36be2749 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9548,7 +9548,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, * @wiphy: the wiphy * @chan: channel for which the supported radio index is required * - * Return: radio index on success or a negative error code + * Return: radio index on success or -EINVAL otherwise */ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, const struct ieee80211_channel *chan); diff --git a/net/wireless/util.c b/net/wireless/util.c index 240c68baa3d1..d12d49134c88 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2584,7 +2584,7 @@ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, } } - return -ENOENT; + return -EINVAL; } EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); -- cgit v1.2.3 From cfb58d5fc964e7e008d8d64c8fa8e9e28e501bc6 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 12 Aug 2025 12:53:29 +0530 Subject: wifi: mac80211: simplify return value handling of cfg80211_get_radio_idx_by_chan() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In several instances where cfg80211_get_radio_idx_by_chan() is called, redundant checks are performed across functions, such as verifying if wiphy->n_radio < 2 or if the returned index is negative. These checks are unnecessary, as the return value can be directly compared. 
Moreover, the function can safely be called even when radio-level
properties are not explicitly advertised, since in that case every call
returns the same error value.

Therefore, simplify the usage of this function across all such cases by
removing the redundant conditions and relying on the return value
directly.

Signed-off-by: Aditya Kumar Singh
Link: https://patch.msgid.link/20250812-fix_scan_ap_flag_requirement_during_mlo-v4-2-383ffb6da213@oss.qualcomm.com
Signed-off-by: Johannes Berg
---
 net/mac80211/cfg.c  | 13 -------------
 net/mac80211/chan.c | 11 -----------
 net/mac80211/util.c | 15 ++++++---------
 3 files changed, 6 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 4603350989c8..fca93cd36bd3 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3683,12 +3683,7 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy,
 	if (list_empty(&local->roc_list) && !local->scanning)
 		return false;
 
-	if (wiphy->n_radio < 2)
-		return true;
-
 	req_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chandef->chan);
-	if (req_radio_idx < 0)
-		return true;
 
 	if (local->scanning) {
 		scan_req = wiphy_dereference(wiphy, local->scan_req);
@@ -3707,14 +3702,6 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy,
 	list_for_each_entry(roc, &local->roc_list, list) {
 		chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy,
 								roc->chan);
-		/*
-		 * The roc work is added but chan_radio_idx is invalid.
-		 * Should not happen but if it does, let's not take
-		 * risk and return true.
-		 */
-		if (chan_radio_idx < 0)
-			return true;
-
 		if (chan_radio_idx == req_radio_idx)
 			return true;
 	}
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index c9cea0e7ac16..57065714cf8c 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -659,19 +659,8 @@ bool ieee80211_is_radar_required(struct ieee80211_local *local,
 
 	for_each_sdata_link(local, link) {
 		if (link->radar_required) {
-			if (wiphy->n_radio < 2)
-				return true;
-
 			chan = link->conf->chanreq.oper.chan;
 			radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
-			/*
-			 * The radio index (radio_idx) is expected to be valid,
-			 * as it's derived from a channel tied to a link. If
-			 * it's invalid (i.e., negative), return true to avoid
-			 * potential issues with radar-sensitive operations.
-			 */
-			if (radio_idx < 0)
-				return true;
 
 			if (ieee80211_is_radio_idx_in_scan_req(wiphy, req,
 							       radio_idx))
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 32f1bc5908c5..51e3e3c913f7 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -4022,16 +4022,13 @@ bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy,
 	for (i = 0; i < scan_req->n_channels; i++) {
 		chan = scan_req->channels[i];
 		chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
-		/*
-		 * The chan_radio_idx should be valid since it's taken from a
-		 * valid scan request.
-		 * However, if chan_radio_idx is unexpectedly invalid (negative),
-		 * we take a conservative approach and assume the scan request
-		 * might use the specified radio_idx. Hence, return true.
-		 */
-		if (WARN_ON(chan_radio_idx < 0))
-			return true;
+		/* The radio index either matched successfully, or an error
+		 * occurred. For example, if radio-level information is
+		 * missing, the same error value is returned. This
+		 * typically implies a single-radio setup, in which case
+		 * the operation should not be allowed.
+ */ if (chan_radio_idx == radio_idx) return true; } -- cgit v1.2.3 From 36b75dcb1e25739a3a0975699208c98f4b55d012 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 12 Aug 2025 12:53:30 +0530 Subject: wifi: mac80211: consider links for validating SCAN_FLAG_AP in scan request during MLO Commit 78a7a126dc5b ("wifi: mac80211: validate SCAN_FLAG_AP in scan request during MLO") introduced a check that rejects scan requests if any link is already beaconing. This works fine when all links share the same radio, but breaks down in multi-radio setups. Consider a scenario where a 2.4 GHz link is beaconing and a scan is requested on a 5 GHz link, each backed by a different physical radio. The current logic still blocks the scan, even though it should be allowed. As a result, interface bring-up fails unnecessarily in valid configurations. Fix this by checking whether the scan is being requested on the same underlying radio as the beaconing link. Only reject the scan if it targets a link that is already beaconing and the NL80211_FEATURE_AP_SCAN is not set. This ensures correct behavior in multi-radio environments and avoids false rejections. Fixes: 78a7a126dc5b ("wifi: mac80211: validate SCAN_FLAG_AP in scan request during MLO") Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250812-fix_scan_ap_flag_requirement_during_mlo-v4-3-383ffb6da213@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index fca93cd36bd3..b26f61f13605 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3007,6 +3007,9 @@ static int ieee80211_scan(struct wiphy *wiphy, struct cfg80211_scan_request *req) { struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; + struct ieee80211_channel *chan; + int radio_idx; sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev); @@ -3034,10 +3037,20 @@ static int ieee80211_scan(struct wiphy *wiphy, * the frames sent while scanning on other channel will be * lost) */ - if (ieee80211_num_beaconing_links(sdata) && - (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || - !(req->flags & NL80211_SCAN_FLAG_AP))) - return -EOPNOTSUPP; + for_each_link_data(sdata, link) { + /* if the link is not beaconing, ignore it */ + if (!sdata_dereference(link->u.ap.beacon, sdata)) + continue; + + chan = link->conf->chanreq.oper.chan; + radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); + + if (ieee80211_is_radio_idx_in_scan_req(wiphy, req, + radio_idx) && + (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || + !(req->flags & NL80211_SCAN_FLAG_AP))) + return -EOPNOTSUPP; + } break; case NL80211_IFTYPE_NAN: default: -- cgit v1.2.3 From d0bf06158c39e7129524dd8b43b82aed84d68faa Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Fri, 15 Aug 2025 14:30:11 -0700 Subject: wifi: nl80211: Add EHT fixed Tx rate support Add new attributes to support EHT MCS/NSS Tx rates and EHT GI/LTF. Parse EHT fixed MCS/NSS Tx rates and EHT GI/LTF values passed by the userspace, validate and add as part of cfg80211_bitrate_mask. MCS mask is constructed by new function, eht_build_mcs_mask(). Max NSS supported for MCS rates of 7, 9, 11 and 13 is utilized to set MCS bitmask for each NSS. MCS rates 14, and 15 if supported, are set only for NSS = 0. 
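To make the bitmap layout concrete, a minimal sketch of a filled-in
request follows (the values are hypothetical; the bit-to-MCS mapping is
taken from the mask constants used by eht_build_mcs_mask(): 0x00FF for
MCS 0-7, 0x0300 for MCS 8-9, 0x0C00 for MCS 10-11, 0x3000 for MCS 12-13,
and 0x4000/0x8000 for MCS 14/15 at index 0 only):

/* Allow only EHT MCS 0-9 on one and two spatial streams; array
 * index i selects NSS i+1, bit n of each u16 entry selects MCS n.
 */
struct nl80211_txrate_eht txrate = {
	.mcs = {
		[0] = 0x03FF,	/* NSS 1: MCS 0-9 (0x00FF | 0x0300) */
		[1] = 0x03FF,	/* NSS 2: MCS 0-9 */
	},
};

This bitmap is passed as the payload of the NL80211_TXRATE_EHT
attribute introduced below.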
Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Signed-off-by: Muna Sinada Link: https://patch.msgid.link/20250815213011.2704803-1-muna.sinada@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 41 +++++++- net/wireless/nl80211.c | 229 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 266 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb1c36be2749..7d881aa7e48b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -841,9 +841,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a4bc0c2729f6..4f08264bbc8e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1943,8 +1943,9 @@ enum nl80211_commands { * The driver must also specify support for this with the extended * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, * NL80211_EXT_FEATURE_BEACON_RATE_HT, - * NL80211_EXT_FEATURE_BEACON_RATE_VHT and - * NL80211_EXT_FEATURE_BEACON_RATE_HE. + * NL80211_EXT_FEATURE_BEACON_RATE_VHT, + * NL80211_EXT_FEATURE_BEACON_RATE_HE and + * NL80211_EXT_FEATURE_BEACON_RATE_EHT. * * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. @@ -3736,6 +3737,22 @@ enum nl80211_eht_gi { NL80211_RATE_INFO_EHT_GI_3_2, }; +/** + * enum nl80211_eht_ltf - EHT long training field + * @NL80211_RATE_INFO_EHT_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_EHT_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_EHT_4XLTF: 12.8 usec + * @NL80211_RATE_INFO_EHT_6XLTF: 19.2 usec + * @NL80211_RATE_INFO_EHT_8XLTF: 25.6 usec + */ +enum nl80211_eht_ltf { + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_2XLTF, + NL80211_RATE_INFO_EHT_4XLTF, + NL80211_RATE_INFO_EHT_6XLTF, + NL80211_RATE_INFO_EHT_8XLTF, +}; + /** * enum nl80211_eht_ru_alloc - EHT RU allocation values * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation @@ -5482,6 +5499,10 @@ enum nl80211_key_attributes { * see &struct nl80211_txrate_he * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. + * @NL80211_TXRATE_EHT: EHT rates allowed for TX rate selection, + * see &struct nl80211_txrate_eht + * @NL80211_TXRATE_EHT_GI: configure EHT GI, (u8, see &enum nl80211_eht_gi) + * @NL80211_TXRATE_EHT_LTF: configure EHT LTF, (u8, see &enum nl80211_eht_ltf) * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -5494,6 +5515,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HE, NL80211_TXRATE_HE_GI, NL80211_TXRATE_HE_LTF, + NL80211_TXRATE_EHT, + NL80211_TXRATE_EHT_GI, + NL80211_TXRATE_EHT_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -5526,6 +5550,15 @@ enum nl80211_txrate_gi { NL80211_TXRATE_FORCE_LGI, }; +#define NL80211_EHT_NSS_MAX 16 +/** + * struct nl80211_txrate_eht - EHT MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) 
+ */ +struct nl80211_txrate_eht { + __u16 mcs[NL80211_EHT_NSS_MAX]; +}; + /** * enum nl80211_band - Frequency band * @NL80211_BAND_2GHZ: 2.4 GHz ISM band @@ -6650,6 +6683,9 @@ enum nl80211_feature_flags { * (signaling and payload protected) A-MSDUs and this shall be advertised * in the RSNXE. * + * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate + * configuration (AP/mesh) with EHT rates. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6725,6 +6761,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, NL80211_EXT_FEATURE_DFS_CONCURRENT, NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, + NL80211_EXT_FEATURE_BEACON_RATE_EHT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..364d83fb9992 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,6 +411,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -5393,6 +5401,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if 
(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5579,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5609,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5695,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + 
mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5730,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5747,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5793,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5827,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From 24185534915b5d926ded098336f47bdcca333aec Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:32 +0200 Subject: wifi: nl80211: allow drivers to support subset of NL80211_CMD_SET_BSS The so-called fullmac devices rely on firmware functionality and/or API to change BSS parameters. Today there are limited drivers supporting the nl80211 primitive, but they only handle a subset of the bss parameters passed if any. The mac80211 driver does handle all parameters and stores their configured values. Some of the BSS parameters were already conditional by wiphy->features. For these the wiphy->bss_param_support and wiphy->features fields are silently aligned in wiphy_register(). Maybe better to issue a warning instead when they are misaligned. 
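As a minimal sketch (the init function is hypothetical; the field and
flags are the ones introduced by this patch), a fullmac driver whose
.change_bss() only handles AP isolation and HT operation mode would
advertise exactly that subset:

static void drv_setup_wiphy(struct wiphy *wiphy)
{
	/* Only these two parameters are handled in .change_bss() */
	wiphy->bss_param_support = WIPHY_BSS_PARAM_AP_ISOLATE |
				   WIPHY_BSS_PARAM_HT_OPMODE;
}

Parameters not advertised this way are simply omitted from the
NL80211_ATTR_BSS_PARAM nest reported by NL80211_CMD_GET_WIPHY.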
Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-2-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/core.c | 9 +++++++++ net/wireless/nl80211.c | 39 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d881aa7e48b..4072a67c9cc9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2459,6 +2459,29 @@ struct mpath_info { int generation; }; +/** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. + * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + /** * struct bss_parameters - BSS parameters * @@ -5785,6 +5808,11 @@ struct wiphy_radio { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5970,6 +5998,7 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4f08264bbc8e..6c07100fc01f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2930,6 +2930,9 @@ enum nl80211_commands { * required alongside this attribute. Refer to * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * + * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY + * which indicates which BSS parameters can be modified. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3491,6 +3494,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, + NL80211_ATTR_BSS_PARAM, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..797f9f2004a6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 364d83fb9992..153644a04072 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3027,6 +3027,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -9048,6 +9082,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9087,7 +9122,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; } @@ 
-9099,7 +9134,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } -- cgit v1.2.3 From 18abf7a05f1e171a290d8abc3078189ca0fe2db0 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:33 +0200 Subject: wifi: drivers: indicate support for attributes in NL80211_CMD_SET_BSS The command NL80211_CMD_SET_BSS has a number of individual attributes and the driver can advertise which of those it will handle when it is changed by user-space. For drivers providing an empty .change_bss() the callback has been removed. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-3-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/cfg80211.c | 1 + drivers/net/wireless/microchip/wilc1000/cfg80211.c | 7 ------- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 8 -------- net/mac80211/main.c | 8 ++++++++ 4 files changed, 9 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 7703a0933a14..7218fe70f3bc 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -2708,6 +2708,7 @@ static void wil_wiphy_init(struct wiphy *wiphy) wiphy->n_cipher_suites = ARRAY_SIZE(wil_cipher_suites); wiphy->mgmt_stypes = wil_mgmt_stypes; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS; + wiphy->bss_param_support = WIPHY_BSS_PARAM_AP_ISOLATE; wiphy->n_vendor_commands = ARRAY_SIZE(wil_nl80211_vendor_commands); wiphy->vendor_commands = wil_nl80211_vendor_commands; diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index a395829ebadf..c39e7f313ea1 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -794,12 +794,6 @@ static int get_station(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int change_bss(struct wiphy *wiphy, struct net_device *dev, - struct bss_parameters *params) -{ - return 0; -} - static int set_wiphy_params(struct wiphy *wiphy, int radio_idx, u32 changed) { int ret = -EINVAL; @@ -1709,7 +1703,6 @@ static const struct cfg80211_ops wilc_cfg80211_ops = { .change_station = change_station, .get_station = get_station, .dump_station = dump_station, - .change_bss = change_bss, .set_wiphy_params = set_wiphy_params, .external_auth = external_auth, diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index ac3d085808e9..315bab373729 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -2441,13 +2441,6 @@ exit: return ret; } -static int cfg80211_rtw_change_bss(struct wiphy *wiphy, - struct net_device *ndev, - struct bss_parameters *params) -{ - return 0; -} - void rtw_cfg80211_rx_action(struct adapter *adapter, u8 *frame, uint frame_len, const char *msg) { s32 freq; @@ -2704,7 +2697,6 @@ static struct cfg80211_ops rtw_cfg80211_ops = { .del_station = cfg80211_rtw_del_station, .change_station = cfg80211_rtw_change_station, .dump_station = cfg80211_rtw_dump_station, - .change_bss = cfg80211_rtw_change_bss, .mgmt_tx = cfg80211_rtw_mgmt_tx, }; diff --git 
a/net/mac80211/main.c b/net/mac80211/main.c index beee51354931..e8c85aa77c56 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -862,6 +862,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (emulate_chanctx || ops->remain_on_channel) wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL; + wiphy->bss_param_support = WIPHY_BSS_PARAM_CTS_PROT | + WIPHY_BSS_PARAM_SHORT_PREAMBLE | + WIPHY_BSS_PARAM_SHORT_SLOT_TIME | + WIPHY_BSS_PARAM_BASIC_RATES | + WIPHY_BSS_PARAM_AP_ISOLATE | + WIPHY_BSS_PARAM_HT_OPMODE | + WIPHY_BSS_PARAM_P2P_CTWINDOW | + WIPHY_BSS_PARAM_P2P_OPPPS; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS | NL80211_FEATURE_SAE | NL80211_FEATURE_HT_IBSS | -- cgit v1.2.3 From 4f652a390db4246c5d3c51bf25d03ed0e4178fdc Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:34 +0200 Subject: wifi: nl80211: strict checking attributes for NL80211_CMD_SET_BSS Assure user-space only modifies attributes for NL80211_CMD_SET_BSS that are supported by the driver. This stricter checking is only done when user-space commits to it by including NL80211_ATTR_BSS_PARAM. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-4-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 ++++- net/wireless/nl80211.c | 52 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6c07100fc01f..aed0b4c5d5e8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2931,7 +2931,10 @@ enum nl80211_commands { * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY - * which indicates which BSS parameters can be modified. + * which indicates which BSS parameters can be modified. The attribute can + * also be used as flag attribute by user-space in %NL80211_CMD_SET_BSS to + * indicate that it wants strict checking on the BSS parameters to be + * modified. 
* * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 153644a04072..99e2aadc65f7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -879,6 +879,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), + [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -9083,6 +9084,8 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; u32 bss_param_support = rdev->wiphy.bss_param_support; + u32 changed = 0; + bool strict; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9095,26 +9098,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = -1; params.p2p_opp_ps = -1; - if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]); + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT)) + return -EINVAL; params.use_cts_prot = nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + changed |= WIPHY_BSS_PARAM_CTS_PROT; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE)) + return -EINVAL; params.use_short_preamble = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME)) + return -EINVAL; params.use_short_slot_time = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + changed |= WIPHY_BSS_PARAM_SHORT_SLOT_TIME; + } if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES)) + return -EINVAL; params.basic_rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); params.basic_rates_len = nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + changed |= WIPHY_BSS_PARAM_BASIC_RATES; } - if (info->attrs[NL80211_ATTR_AP_ISOLATE]) - params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE)) + return -EINVAL; + params.ap_isolate = + !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + changed |= WIPHY_BSS_PARAM_AP_ISOLATE; + } + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE)) + return -EINVAL; params.ht_opmode = nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + changed |= WIPHY_BSS_PARAM_HT_OPMODE; + } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -9124,6 +9155,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (params.p2p_ctwindow != 0 && !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; + changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW; } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { @@ -9132,9 +9164,11 @@ static int 
nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); + if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } @@ -9145,6 +9179,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; + changed &= rdev->wiphy.bss_param_support; + if (!changed) + return 0; + return rdev_change_bss(rdev, dev, ¶ms); } -- cgit v1.2.3 From 937d6aea5c6211fe2c8eb6f99b5baa6887a696c4 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Wed, 3 Sep 2025 11:39:03 +0300 Subject: wifi: mac80211: reduce the scope of link_id Reduce the scope of the link_id variable in sta_set_sinfo to the 'if' scope. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250903083904.1972284-2-miriam.rachel.korenblit@intel.com Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8c550aab9bdc..8e275f0a1238 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2962,7 +2962,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; - int i, ac, cpu, link_id; + int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta, -1); @@ -3204,6 +3204,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (sta->sta.valid_links) { struct ieee80211_link_data *link; struct link_sta_info *link_sta; + int link_id; ether_addr_copy(sinfo->mld_addr, sta->addr); for_each_valid_link(sinfo, link_id) { -- cgit v1.2.3 From 7a7458ed0df90d4870f902fddc85b4a413ef7de4 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Wed, 3 Sep 2025 11:39:04 +0300 Subject: wifi: mac80211: reduce the scope of rts_threshold This is only needed within the 'if' scope, not in the function scope. 
Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250903083904.1972284-3-miriam.rachel.korenblit@intel.com Signed-off-by: Johannes Berg --- net/mac80211/util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 51e3e3c913f7..9eb35e3b9e52 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1756,7 +1756,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; - u32 rts_threshold; lockdep_assert_wiphy(local->hw.wiphy); @@ -1832,7 +1831,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* setup RTS threshold */ if (hw->wiphy->n_radio > 0) { for (i = 0; i < hw->wiphy->n_radio; i++) { - rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold; + u32 rts_threshold = + hw->wiphy->radio_cfg[i].rts_threshold; + drv_set_rts_threshold(local, i, rts_threshold); } } else { -- cgit v1.2.3 From e53f8b12a21c2974b66fa8c706090182da06fff3 Mon Sep 17 00:00:00 2001 From: Ramya Gnanasekar Date: Fri, 6 Jun 2025 16:14:36 +0530 Subject: wifi: mac80211: Fix 6 GHz Band capabilities element advertisement in lower bands Currently, when adding the 6 GHz Band Capabilities element, the channel list of the wiphy is checked to determine if 6 GHz is supported for a given virtual interface. However, in a multi-radio wiphy (e.g., one that has both lower bands and 6 GHz combined), the wiphy advertises support for all bands. As a result, the 6 GHz Band Capabilities element is incorrectly included in mesh beacon and station's association request frames of interfaces operating in lower bands, without verifying whether the interface is actually operating in a 6 GHz channel. Fix this by verifying if the interface operates on 6 GHz channel before adding the element. Note that this check cannot be placed directly in ieee80211_put_he_6ghz_cap() as the same function is used to add probe request elements while initiating scan in which case the interface may not be operating in any band's channel. 
Signed-off-by: Ramya Gnanasekar Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20250606104436.326654-1-rameshkumar.sundaram@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/mac80211/mesh.c | 3 +++ net/mac80211/mlme.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index a4a715f6f1c3..f37068a533f4 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -624,6 +624,9 @@ int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + if (sband->band != NL80211_BAND_6GHZ) + return 0; + iftd = ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_MESH_POINT); /* The device doesn't support HE in mesh mode or at all */ diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 9568cc95a7ff..83a9986dd1c4 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1842,7 +1842,8 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, ieee80211_put_he_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); - ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); + if (sband->band == NL80211_BAND_6GHZ) + ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* -- cgit v1.2.3 From c975e1dfcc929dbfde8abfa514494b66f0335006 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Mon, 1 Sep 2025 16:58:42 +0200 Subject: net/smc: Improve log message for devices w/o pnetid Explicitly state in the log message, when a device has no pnetid. "with pnetid" and "has pnetid" was misleading for devices without pnetid. Signed-off-by: Alexandra Winter Reviewed-by: Dust Li Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250901145842.1718373-3-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/smc_ib.c | 18 +++++++++++------- net/smc/smc_ism.c | 13 +++++++++---- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 53828833a3f7..f2de12990b5b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -971,13 +971,17 @@ static int smc_ib_add_dev(struct ib_device *ibdev) smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); smc_copy_netdev_ifindex(smcibdev, i); - pr_warn_ratelimited("smc: ib device %s port %d has pnetid " - "%.16s%s\n", - smcibdev->ibdev->name, i + 1, - smcibdev->pnetid[i], - smcibdev->pnetid_by_user[i] ? - " (user defined)" : - ""); + if (smc_pnet_is_pnetid_set(smcibdev->pnetid[i])) + pr_warn_ratelimited("smc: ib device %s port %d has pnetid %.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: ib device %s port %d has no pnetid\n", + smcibdev->ibdev->name, i + 1); + } schedule_work(&smcibdev->port_event_work); return 0; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 84f98e18c7db..a58ffb7a0610 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -518,10 +518,15 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? " (user defined)" : ""); - + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&ism->dev), smcd->pnetid, + smcd->pnetid_by_user ? 
+ " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&ism->dev)); return; } -- cgit v1.2.3 From 5f9238530970f2993b23dd67fdaffc552a2d2e98 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Sep 2025 08:47:18 +0000 Subject: tcp: fix __tcp_close() to only send RST when required If the receive queue contains payload that was already received, __tcp_close() can send an unexpected RST. Refine the code to take tp->copied_seq into account, as we already do in tcp recvmsg(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250903084720.1168904-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 40b774b4f587..39eb03f6d07f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3099,8 +3099,8 @@ bool tcp_check_oom(const struct sock *sk, int shift) void __tcp_close(struct sock *sk, long timeout) { + bool data_was_unread = false; struct sk_buff *skb; - int data_was_unread = 0; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); @@ -3119,11 +3119,12 @@ void __tcp_close(struct sock *sk, long timeout) * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - len--; - data_was_unread += len; + end_seq--; + if (after(end_seq, tcp_sk(sk)->copied_seq)) + data_was_unread = true; __kfree_skb(skb); } -- cgit v1.2.3 From b13592d20b210976a0946adf027b7bd9d7734326 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Sep 2025 08:47:20 +0000 Subject: tcp: use tcp_eat_recv_skb in __tcp_close() Small change to use tcp_eat_recv_skb() instead of __kfree_skb(). This can help if an application under attack has to close many sockets with unread data. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250903084720.1168904-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39eb03f6d07f..588932c3cf1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3118,14 +3118,14 @@ void __tcp_close(struct sock *sk, long timeout) * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) end_seq--; if (after(end_seq, tcp_sk(sk)->copied_seq)) data_was_unread = true; - __kfree_skb(skb); + tcp_eat_recv_skb(sk, skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ -- cgit v1.2.3 From 16c610162d1f1c332209de1c91ffb09b659bb65d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 3 Sep 2025 17:48:10 +0000 Subject: net: call cond_resched() less often in __release_sock() While stress testing TCP I had unexpected retransmits and sack packets when a single cpu receives data from multiple high-throughput flows. 
super_netperf 4 -H srv -T,10 -l 3000 & Tcpdump extract: 00:00:00.000007 IP6 clnt > srv: Flags [.], seq 26062848:26124288, ack 1, win 66, options [nop,nop,TS val 651460834 ecr 3100749131], length 61440 00:00:00.000006 IP6 clnt > srv: Flags [.], seq 26124288:26185728, ack 1, win 66, options [nop,nop,TS val 651460834 ecr 3100749131], length 61440 00:00:00.000005 IP6 clnt > srv: Flags [P.], seq 26185728:26243072, ack 1, win 66, options [nop,nop,TS val 651460834 ecr 3100749131], length 57344 00:00:00.000006 IP6 clnt > srv: Flags [.], seq 26243072:26304512, ack 1, win 66, options [nop,nop,TS val 651460844 ecr 3100749141], length 61440 00:00:00.000005 IP6 clnt > srv: Flags [.], seq 26304512:26365952, ack 1, win 66, options [nop,nop,TS val 651460844 ecr 3100749141], length 61440 00:00:00.000007 IP6 clnt > srv: Flags [P.], seq 26365952:26423296, ack 1, win 66, options [nop,nop,TS val 651460844 ecr 3100749141], length 57344 00:00:00.000006 IP6 clnt > srv: Flags [.], seq 26423296:26484736, ack 1, win 66, options [nop,nop,TS val 651460853 ecr 3100749150], length 61440 00:00:00.000005 IP6 clnt > srv: Flags [.], seq 26484736:26546176, ack 1, win 66, options [nop,nop,TS val 651460853 ecr 3100749150], length 61440 00:00:00.000005 IP6 clnt > srv: Flags [P.], seq 26546176:26603520, ack 1, win 66, options [nop,nop,TS val 651460853 ecr 3100749150], length 57344 00:00:00.003932 IP6 clnt > srv: Flags [P.], seq 26603520:26619904, ack 1, win 66, options [nop,nop,TS val 651464844 ecr 3100753141], length 16384 00:00:00.006602 IP6 clnt > srv: Flags [.], seq 24862720:24866816, ack 1, win 66, options [nop,nop,TS val 651471419 ecr 3100759716], length 4096 00:00:00.013000 IP6 clnt > srv: Flags [.], seq 24862720:24866816, ack 1, win 66, options [nop,nop,TS val 651484421 ecr 3100772718], length 4096 00:00:00.000416 IP6 srv > clnt: Flags [.], ack 26619904, win 1393, options [nop,nop,TS val 3100773185 ecr 651484421,nop,nop,sack 1 {24862720:24866816}], length 0 After analysis, it appears this is because of the cond_resched() call from __release_sock(). When current thread is yielding, while still holding the TCP socket lock, it might regain the cpu after a very long time. Other peer TLP/RTO is firing (multiple times) and packets are retransmit, while the initial copy is waiting in the socket backlog or receive queue. In this patch, I call cond_resched() only once every 16 packets. 
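The counting pattern, as a standalone sketch (next_packet() and
process_one() are placeholders): because 16 is a power of two,
"(++nb & 15) == 0" tests "every 16th packet" with a mask instead of a
division.

int nb = 0;
struct sk_buff *skb;

while ((skb = next_packet()) != NULL) {
	process_one(skb);
	if (!(++nb & 15))	/* true once every 16 packets */
		cond_resched();
}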
Modern TCP stack now spends less time per packet in the backlog, especially because ACK are no longer sent (commit 133c4c0d3717 "tcp: defer regular ACK while processing socket backlog") Before: clnt:/# nstat -n;sleep 10;nstat|egrep "TcpOutSegs|TcpRetransSegs|TCPFastRetrans|TCPTimeouts|Probes|TCPSpuriousRTOs|DSACK" TcpOutSegs 19046186 0.0 TcpRetransSegs 1471 0.0 TcpExtTCPTimeouts 1397 0.0 TcpExtTCPLossProbes 1356 0.0 TcpExtTCPDSACKRecv 1352 0.0 TcpExtTCPSpuriousRTOs 114 0.0 TcpExtTCPDSACKRecvSegs 1352 0.0 After: clnt:/# nstat -n;sleep 10;nstat|egrep "TcpOutSegs|TcpRetransSegs|TCPFastRetrans|TCPTimeouts|Probes|TCPSpuriousRTOs|DSACK" TcpOutSegs 19218936 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250903174811.1930820-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 02f31f21b4af..1f8ef4d8bcd9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3165,23 +3165,27 @@ void __release_sock(struct sock *sk) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; + int nb = 0; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); - do { + while (1) { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); - cond_resched(); - skb = next; - } while (skb != NULL); + if (!skb) + break; + + if (!(++nb & 15)) + cond_resched(); + } spin_lock_bh(&sk->sk_lock.slock); } -- cgit v1.2.3 From e89888a1e778db5954e702defc44cfbc4ebe92c2 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Thu, 28 Aug 2025 20:28:30 +0200 Subject: batman-adv: Start new development cycle This version will contain all the (major or even only minor) changes for Linux 6.18. The version number isn't a semantic version number with major and minor information. It is just encoding the year of the expected publishing as Linux -rc1 and the number of published versions this year (starting at 0). Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 1481eb2bacee..0d3ec0f1c9fb 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2025.3" +#define BATADV_SOURCE_VERSION "2025.4" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From 87b95082db32ae1cfe66d04052da8c6b21531110 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 28 Aug 2025 17:33:48 +0200 Subject: batman-adv: remove network coding support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Network Coding feature, introduced in 2013, is based on the master thesis "Inter-Flow Network Coding for Wireless Mesh Networks". It relies on the assumption that neighboring mesh nodes can reliably overhear each other's transmissions in promiscuous mode, allowing packets to be combined to reduce forwarding overhead. This assumption no longer holds for modern wireless mesh networks, which are heterogeneous and make overhearing increasingly unreliable. Factors such as multiple spatial streams, varying data rates, beamforming, and OFDMA all prevent nodes from consistently overhearing each other. 
The current implementation in batman-adv is not able to detect these conditions and would require a more complex layer beyond its neighbor discovery process to do so. In addition, the feature has been unmaintained for years and is discouraged for use. None of the current maintainers have the required test setup to verify its functionality, and known issues remain in its data structures (reference counting, RCU usage, and cleanup handling). Its continued presence also blocks necessary refactoring of the core originator infrastructure. Remove this obsolete and unmaintained feature. Signed-off-by: Sven Eckelmann Acked-by: Martin Hundebøll Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/Kconfig | 13 - net/batman-adv/Makefile | 1 - net/batman-adv/bat_iv_ogm.c | 5 - net/batman-adv/log.h | 3 - net/batman-adv/main.c | 16 - net/batman-adv/main.h | 2 - net/batman-adv/mesh-interface.c | 14 - net/batman-adv/netlink.c | 17 - net/batman-adv/network-coding.c | 1878 ------------------------------------ net/batman-adv/network-coding.h | 106 -- net/batman-adv/originator.c | 6 - net/batman-adv/routing.c | 9 +- net/batman-adv/send.c | 16 +- net/batman-adv/translation-table.c | 4 +- net/batman-adv/types.h | 216 ----- 15 files changed, 4 insertions(+), 2302 deletions(-) delete mode 100644 net/batman-adv/network-coding.c delete mode 100644 net/batman-adv/network-coding.h (limited to 'net') diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 20b316207f9a..c299e2bc87ed 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -53,19 +53,6 @@ config BATMAN_ADV_DAT mesh networks. If you think that your network does not need this option you can safely remove it and save some space. -config BATMAN_ADV_NC - bool "Network Coding" - depends on BATMAN_ADV - help - This option enables network coding, a mechanism that aims to - increase the overall network throughput by fusing multiple - packets in one transmission. - Note that interfaces controlled by batman-adv must be manually - configured to have promiscuous mode enabled in order to make - network coding work. - If you think that your network does not need this feature you - can safely disable it and save some space. 
- config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 1cc9be6de456..d3c4d4143c14 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -23,7 +23,6 @@ batman-adv-y += mesh-interface.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o batman-adv-y += netlink.o -batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o batman-adv-y += send.o diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 54fe38b3b2fd..b75c2228e69a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -52,7 +52,6 @@ #include "hash.h" #include "log.h" #include "netlink.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" @@ -1406,10 +1405,6 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, if (!orig_neigh_node) goto out; - /* Update nc_nodes of the originator */ - batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, - ogm_packet, is_single_hop_neigh); - orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 567afaa8df99..225b747a2048 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -51,9 +51,6 @@ enum batadv_dbg_level { /** @BATADV_DBG_DAT: ARP snooping and DAT related messages */ BATADV_DBG_DAT = BIT(4), - /** @BATADV_DBG_NC: network coding related messages */ - BATADV_DBG_NC = BIT(5), - /** @BATADV_DBG_MCAST: multicast related messages */ BATADV_DBG_MCAST = BIT(6), diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 20346d7b6b69..1dcacfc310ee 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -53,7 +53,6 @@ #include "mesh-interface.h" #include "multicast.h" #include "netlink.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" @@ -103,7 +102,6 @@ static int __init batadv_init(void) batadv_v_init(); batadv_iv_init(); - batadv_nc_init(); batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); @@ -218,12 +216,6 @@ int batadv_mesh_init(struct net_device *mesh_iface) goto err_dat; } - ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) { - atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); - goto err_nc; - } - batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -232,8 +224,6 @@ int batadv_mesh_init(struct net_device *mesh_iface) return 0; -err_nc: - batadv_dat_free(bat_priv); err_dat: batadv_bla_free(bat_priv); err_bla: @@ -264,7 +254,6 @@ void batadv_mesh_free(struct net_device *mesh_iface) batadv_gw_node_free(bat_priv); batadv_v_mesh_free(bat_priv); - batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); @@ -336,11 +325,6 @@ int batadv_max_header_len(void) header_len = max_t(int, header_len, sizeof(struct batadv_bcast_packet)); -#ifdef CONFIG_BATMAN_ADV_NC - header_len = max_t(int, header_len, - sizeof(struct batadv_coded_packet)); -#endif - return header_len + ETH_HLEN; } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 0d3ec0f1c9fb..7352b11df968 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -121,8 +121,6 @@ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 -#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ - /** * 
BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions */ diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index de2c2d9c6e4d..be55d8d87348 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -46,7 +46,6 @@ #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" -#include "network-coding.h" #include "send.h" #include "translation-table.h" @@ -802,8 +801,6 @@ static int batadv_meshif_init_late(struct net_device *dev) bat_priv->primary_if = NULL; - batadv_nc_init_bat_priv(bat_priv); - if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) @@ -947,17 +944,6 @@ static const struct { { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif -#ifdef CONFIG_BATMAN_ADV_NC - { "nc_code" }, - { "nc_code_bytes" }, - { "nc_recode" }, - { "nc_recode_bytes" }, - { "nc_buffer" }, - { "nc_decode" }, - { "nc_decode_bytes" }, - { "nc_decode_failed" }, - { "nc_sniffed" }, -#endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index beb181b3a7d8..78c651f634cd 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -44,7 +44,6 @@ #include "log.h" #include "mesh-interface.h" #include "multicast.h" -#include "network-coding.h" #include "originator.h" #include "tp_meter.h" #include "translation-table.h" @@ -144,7 +143,6 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, - [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, @@ -345,12 +343,6 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ -#ifdef CONFIG_BATMAN_ADV_NC - if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED, - !!atomic_read(&bat_priv->network_coding))) - goto nla_put_failure; -#endif /* CONFIG_BATMAN_ADV_NC */ - if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; @@ -588,15 +580,6 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_MCAST */ -#ifdef CONFIG_BATMAN_ADV_NC - if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) { - attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; - - atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); - batadv_nc_status_update(bat_priv->mesh_iface); - } -#endif /* CONFIG_BATMAN_ADV_NC */ - if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { u32 orig_interval; diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c deleted file mode 100644 index af97d077369f..000000000000 --- a/net/batman-adv/network-coding.c +++ /dev/null @@ -1,1878 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) B.A.T.M.A.N. 
contributors: - * - * Martin Hundebøll, Jeppe Ledet-Pedersen - */ - -#include "network-coding.h" -#include "main.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hash.h" -#include "log.h" -#include "originator.h" -#include "routing.h" -#include "send.h" -#include "tvlv.h" - -static struct lock_class_key batadv_nc_coding_hash_lock_class_key; -static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; - -static void batadv_nc_worker(struct work_struct *work); -static int batadv_nc_recv_coded_packet(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); - -/** - * batadv_nc_init() - one-time initialization for network coding - * - * Return: 0 on success or negative error number in case of failure - */ -int __init batadv_nc_init(void) -{ - /* Register our packet type */ - return batadv_recv_handler_register(BATADV_CODED, - batadv_nc_recv_coded_packet); -} - -/** - * batadv_nc_start_timer() - initialise the nc periodic worker - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_start_timer(struct batadv_priv *bat_priv) -{ - queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work, - msecs_to_jiffies(10)); -} - -/** - * batadv_nc_tvlv_container_update() - update the network coding tvlv container - * after network coding setting change - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) -{ - char nc_mode; - - nc_mode = atomic_read(&bat_priv->network_coding); - - switch (nc_mode) { - case 0: - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); - break; - case 1: - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1, - NULL, 0); - break; - } -} - -/** - * batadv_nc_status_update() - update the network coding tvlv container after - * network coding setting change - * @net_dev: the mesh interface net device - */ -void batadv_nc_status_update(struct net_device *net_dev) -{ - struct batadv_priv *bat_priv = netdev_priv(net_dev); - - batadv_nc_tvlv_container_update(bat_priv); -} - -/** - * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container - * @bat_priv: the bat priv with all the mesh interface information - * @orig: the orig_node of the ogm - * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) - * @tvlv_value: tvlv buffer containing the gateway data - * @tvlv_value_len: tvlv buffer length - */ -static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, u16 tvlv_value_len) -{ - if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); - else - set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); -} - -/** - * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping - * @bat_priv: the bat priv with all the mesh interface information - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_nc_mesh_init(struct batadv_priv *bat_priv) -{ - bat_priv->nc.timestamp_fwd_flush = jiffies; - bat_priv->nc.timestamp_sniffed_purge = jiffies; - - if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash) - return 0; - - bat_priv->nc.coding_hash = batadv_hash_new(128); - if 
(!bat_priv->nc.coding_hash) - goto err; - - batadv_hash_set_lock_class(bat_priv->nc.coding_hash, - &batadv_nc_coding_hash_lock_class_key); - - bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) { - batadv_hash_destroy(bat_priv->nc.coding_hash); - goto err; - } - - batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, - &batadv_nc_decoding_hash_lock_class_key); - - INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker); - batadv_nc_start_timer(bat_priv); - - batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, - NULL, NULL, BATADV_TVLV_NC, 1, - BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - batadv_nc_tvlv_container_update(bat_priv); - return 0; - -err: - return -ENOMEM; -} - -/** - * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables - * @bat_priv: the bat priv with all the mesh interface information - */ -void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) -{ - atomic_set(&bat_priv->network_coding, 0); - bat_priv->nc.min_tq = 200; - bat_priv->nc.max_fwd_delay = 10; - bat_priv->nc.max_buffer_time = 200; -} - -/** - * batadv_nc_init_orig() - initialise the nc fields of an orig_node - * @orig_node: the orig_node which is going to be initialised - */ -void batadv_nc_init_orig(struct batadv_orig_node *orig_node) -{ - INIT_LIST_HEAD(&orig_node->in_coding_list); - INIT_LIST_HEAD(&orig_node->out_coding_list); - spin_lock_init(&orig_node->in_coding_list_lock); - spin_lock_init(&orig_node->out_coding_list_lock); -} - -/** - * batadv_nc_node_release() - release nc_node from lists and queue for free - * after rcu grace period - * @ref: kref pointer of the nc_node - */ -static void batadv_nc_node_release(struct kref *ref) -{ - struct batadv_nc_node *nc_node; - - nc_node = container_of(ref, struct batadv_nc_node, refcount); - - batadv_orig_node_put(nc_node->orig_node); - kfree_rcu(nc_node, rcu); -} - -/** - * batadv_nc_node_put() - decrement the nc_node refcounter and possibly - * release it - * @nc_node: nc_node to be free'd - */ -static void batadv_nc_node_put(struct batadv_nc_node *nc_node) -{ - if (!nc_node) - return; - - kref_put(&nc_node->refcount, batadv_nc_node_release); -} - -/** - * batadv_nc_path_release() - release nc_path from lists and queue for free - * after rcu grace period - * @ref: kref pointer of the nc_path - */ -static void batadv_nc_path_release(struct kref *ref) -{ - struct batadv_nc_path *nc_path; - - nc_path = container_of(ref, struct batadv_nc_path, refcount); - - kfree_rcu(nc_path, rcu); -} - -/** - * batadv_nc_path_put() - decrement the nc_path refcounter and possibly - * release it - * @nc_path: nc_path to be free'd - */ -static void batadv_nc_path_put(struct batadv_nc_path *nc_path) -{ - if (!nc_path) - return; - - kref_put(&nc_path->refcount, batadv_nc_path_release); -} - -/** - * batadv_nc_packet_free() - frees nc packet - * @nc_packet: the nc packet to free - * @dropped: whether the packet is freed because is dropped - */ -static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, - bool dropped) -{ - if (dropped) - kfree_skb(nc_packet->skb); - else - consume_skb(nc_packet->skb); - - batadv_nc_path_put(nc_packet->nc_path); - kfree(nc_packet); -} - -/** - * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged - * @bat_priv: the bat priv with all the mesh interface information - * @nc_node: the nc node to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, - struct 
batadv_nc_node *nc_node) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT); -} - -/** - * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - /* purge the path when no packets has been added for 10 times the - * max_fwd_delay time - */ - return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_fwd_delay * 10); -} - -/** - * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed - * out - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - /* purge the path when no packets has been added for 10 times the - * max_buffer time - */ - return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_buffer_time * 10); -} - -/** - * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale - * entries - * @bat_priv: the bat priv with all the mesh interface information - * @list: list of nc nodes - * @lock: nc node list lock - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. This function takes the nc node as argument and has to return - * a boolean value: true if the entry has to be deleted, false - * otherwise - */ -static void -batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, - struct list_head *list, - spinlock_t *lock, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ - struct batadv_nc_node *nc_node, *nc_node_tmp; - - /* For each nc_node in list */ - spin_lock_bh(lock); - list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) { - /* if an helper function has been passed as parameter, - * ask it if the entry has to be purged or not - */ - if (to_purge && !to_purge(bat_priv, nc_node)) - continue; - - batadv_dbg(BATADV_DBG_NC, bat_priv, - "Removing nc_node %pM -> %pM\n", - nc_node->addr, nc_node->orig_node->orig); - list_del_rcu(&nc_node->list); - batadv_nc_node_put(nc_node); - } - spin_unlock_bh(lock); -} - -/** - * batadv_nc_purge_orig() - purges all nc node data attached of the given - * originator - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig_node with the nc node entries to be purged - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. 
This function takes the nc node as argument and has to return - * a boolean value: true is the entry has to be deleted, false - * otherwise - */ -void batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ - /* Check ingoing nc_node's of this orig_node */ - batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list, - &orig_node->in_coding_list_lock, - to_purge); - - /* Check outgoing nc_node's of this orig_node */ - batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list, - &orig_node->out_coding_list_lock, - to_purge); -} - -/** - * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if - * they have timed out nc nodes - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) -{ - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - u32 i; - - if (!hash) - return; - - /* For each orig_node */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) - batadv_nc_purge_orig(bat_priv, orig_node, - batadv_nc_to_purge_nc_node); - rcu_read_unlock(); - } -} - -/** - * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove - * unused ones - * @bat_priv: the bat priv with all the mesh interface information - * @hash: hash table containing the nc paths to check - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. This function takes the nc node as argument and has to return - * a boolean value: true is the entry has to be deleted, false - * otherwise - */ -static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_path *)) -{ - struct hlist_head *head; - struct hlist_node *node_tmp; - struct batadv_nc_path *nc_path; - spinlock_t *lock; /* Protects lists in hash */ - u32 i; - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - lock = &hash->list_locks[i]; - - /* For each nc_path in this bin */ - spin_lock_bh(lock); - hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) { - /* if an helper function has been passed as parameter, - * ask it if the entry has to be purged or not - */ - if (to_purge && !to_purge(bat_priv, nc_path)) - continue; - - /* purging an non-empty nc_path should never happen, but - * is observed under high CPU load. Delay the purging - * until next iteration to allow the packet_list to be - * emptied first. 
- */ - if (!unlikely(list_empty(&nc_path->packet_list))) { - net_ratelimited_function(printk, - KERN_WARNING - "Skipping free of non-empty nc_path (%pM -> %pM)!\n", - nc_path->prev_hop, - nc_path->next_hop); - continue; - } - - /* nc_path is unused, so remove it */ - batadv_dbg(BATADV_DBG_NC, bat_priv, - "Remove nc_path %pM -> %pM\n", - nc_path->prev_hop, nc_path->next_hop); - hlist_del_rcu(&nc_path->hash_entry); - batadv_nc_path_put(nc_path); - } - spin_unlock_bh(lock); - } -} - -/** - * batadv_nc_hash_key_gen() - computes the nc_path hash key - * @key: buffer to hold the final hash key - * @src: source ethernet mac address going into the hash key - * @dst: destination ethernet mac address going into the hash key - */ -static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, - const char *dst) -{ - memcpy(key->prev_hop, src, sizeof(key->prev_hop)); - memcpy(key->next_hop, dst, sizeof(key->next_hop)); -} - -/** - * batadv_nc_hash_choose() - compute the hash value for an nc path - * @data: data to hash - * @size: size of the hash table - * - * Return: the selected index in the hash table for the given data. - */ -static u32 batadv_nc_hash_choose(const void *data, u32 size) -{ - const struct batadv_nc_path *nc_path = data; - u32 hash = 0; - - hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); - hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); - - return hash % size; -} - -/** - * batadv_nc_hash_compare() - comparing function used in the network coding hash - * tables - * @node: node in the local table - * @data2: second object to compare the node to - * - * Return: true if the two entry are the same, false otherwise - */ -static bool batadv_nc_hash_compare(const struct hlist_node *node, - const void *data2) -{ - const struct batadv_nc_path *nc_path1, *nc_path2; - - nc_path1 = container_of(node, struct batadv_nc_path, hash_entry); - nc_path2 = data2; - - /* Return 1 if the two keys are identical */ - if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) - return false; - - if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) - return false; - - return true; -} - -/** - * batadv_nc_hash_find() - search for an existing nc path and return it - * @hash: hash table containing the nc path - * @data: search key - * - * Return: the nc_path if found, NULL otherwise. - */ -static struct batadv_nc_path * -batadv_nc_hash_find(struct batadv_hashtable *hash, - void *data) -{ - struct hlist_head *head; - struct batadv_nc_path *nc_path, *nc_path_tmp = NULL; - int index; - - if (!hash) - return NULL; - - index = batadv_nc_hash_choose(data, hash->size); - head = &hash->table[index]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, head, hash_entry) { - if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) - continue; - - if (!kref_get_unless_zero(&nc_path->refcount)) - continue; - - nc_path_tmp = nc_path; - break; - } - rcu_read_unlock(); - - return nc_path_tmp; -} - -/** - * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct - * @nc_packet: the nc packet to send - */ -static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) -{ - batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); - nc_packet->skb = NULL; - batadv_nc_packet_free(nc_packet, false); -} - -/** - * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. 
- * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path the packet belongs to - * @nc_packet: the nc packet to be checked - * - * Checks whether the given sniffed (overheard) nc_packet has hit its buffering - * timeout. If so, the packet is no longer kept and the entry deleted from the - * queue. Has to be called with the appropriate locks. - * - * Return: false as soon as the entry in the fifo queue has not been timed out - * yet and true otherwise. - */ -static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path, - struct batadv_nc_packet *nc_packet) -{ - unsigned long timeout = bat_priv->nc.max_buffer_time; - bool res = false; - - lockdep_assert_held(&nc_path->packet_list_lock); - - /* Packets are added to tail, so the remaining packets did not time - * out and we can stop processing the current queue - */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && - !batadv_has_timed_out(nc_packet->timestamp, timeout)) - goto out; - - /* purge nc packet */ - list_del(&nc_packet->list); - batadv_nc_packet_free(nc_packet, true); - - res = true; - -out: - return res; -} - -/** - * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path the packet belongs to - * @nc_packet: the nc packet to be checked - * - * Checks whether the given nc packet has hit its forward timeout. If so, the - * packet is no longer delayed, immediately sent and the entry deleted from the - * queue. Has to be called with the appropriate locks. - * - * Return: false as soon as the entry in the fifo queue has not been timed out - * yet and true otherwise. - */ -static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path, - struct batadv_nc_packet *nc_packet) -{ - unsigned long timeout = bat_priv->nc.max_fwd_delay; - - lockdep_assert_held(&nc_path->packet_list_lock); - - /* Packets are added to tail, so the remaining packets did not time - * out and we can stop processing the current queue - */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && - !batadv_has_timed_out(nc_packet->timestamp, timeout)) - return false; - - /* Send packet */ - batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); - batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, - nc_packet->skb->len + ETH_HLEN); - list_del(&nc_packet->list); - batadv_nc_send_packet(nc_packet); - - return true; -} - -/** - * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed - * out nc packets - * @bat_priv: the bat priv with all the mesh interface information - * @hash: to be processed hash table - * @process_fn: Function called to process given nc packet. Should return true - * to encourage this function to proceed with the next packet. - * Otherwise the rest of the current queue is skipped. 
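Both flush helpers above lean on FIFO ordering: packets are appended at the tail, so the scan can stop at the first entry that has not yet timed out. A minimal userspace sketch of that early-exit scan, using wall-clock seconds where the kernel code uses jiffies (illustrative only; all names are made up):

#include <stddef.h>
#include <stdio.h>
#include <time.h>

struct entry {
	time_t timestamp;	/* when the packet was queued */
};

/*
 * Flush entries older than timeout_s and return how many were flushed.
 * Because entries are appended at the tail, the queue is in insertion
 * order: the first entry that has not expired ends the scan, since
 * everything behind it is newer still.
 */
static size_t flush_expired(const struct entry *q, size_t n, time_t timeout_s)
{
	time_t now = time(NULL);
	size_t i;

	for (i = 0; i < n; i++) {
		if (now - q[i].timestamp < timeout_s)
			break;	/* remaining entries are newer: stop */
		/* a real implementation would send/free q[i] here */
	}
	return i;
}

int main(void)
{
	time_t now = time(NULL);
	struct entry q[3] = { { now - 30 }, { now - 10 }, { now } };

	printf("flushed %zu entries\n", flush_expired(q, 3, 20));
	return 0;
}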
- */ -static void -batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - bool (*process_fn)(struct batadv_priv *, - struct batadv_nc_path *, - struct batadv_nc_packet *)) -{ - struct hlist_head *head; - struct batadv_nc_packet *nc_packet, *nc_packet_tmp; - struct batadv_nc_path *nc_path; - bool ret; - int i; - - if (!hash) - return; - - /* Loop hash table bins */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - /* Loop coding paths */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, head, hash_entry) { - /* Loop packets */ - spin_lock_bh(&nc_path->packet_list_lock); - list_for_each_entry_safe(nc_packet, nc_packet_tmp, - &nc_path->packet_list, list) { - ret = process_fn(bat_priv, nc_path, nc_packet); - if (!ret) - break; - } - spin_unlock_bh(&nc_path->packet_list_lock); - } - rcu_read_unlock(); - } -} - -/** - * batadv_nc_worker() - periodic task for housekeeping related to network - * coding - * @work: kernel work struct - */ -static void batadv_nc_worker(struct work_struct *work) -{ - struct delayed_work *delayed_work; - struct batadv_priv_nc *priv_nc; - struct batadv_priv *bat_priv; - unsigned long timeout; - - delayed_work = to_delayed_work(work); - priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); - bat_priv = container_of(priv_nc, struct batadv_priv, nc); - - batadv_nc_purge_orig_hash(bat_priv); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, - batadv_nc_to_purge_nc_path_coding); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, - batadv_nc_to_purge_nc_path_decoding); - - timeout = bat_priv->nc.max_fwd_delay; - - if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) { - batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash, - batadv_nc_fwd_flush); - bat_priv->nc.timestamp_fwd_flush = jiffies; - } - - if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge, - bat_priv->nc.max_buffer_time)) { - batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash, - batadv_nc_sniffed_purge); - bat_priv->nc.timestamp_sniffed_purge = jiffies; - } - - /* Schedule a new check */ - batadv_nc_start_timer(bat_priv); -} - -/** - * batadv_can_nc_with_orig() - checks whether the given orig node is suitable - * for coding or not - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: neighboring orig node which may be used as nc candidate - * @ogm_packet: incoming ogm packet also used for the checks - * - * Return: true if: - * 1) The OGM must have the most recent sequence number. - * 2) The TTL must be decremented by one and only one. - * 3) The OGM must be received from the first hop from orig_node. - * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq. 
- */ -static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_ogm_packet *ogm_packet) -{ - struct batadv_orig_ifinfo *orig_ifinfo; - u32 last_real_seqno; - u8 last_ttl; - - orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); - if (!orig_ifinfo) - return false; - - last_ttl = orig_ifinfo->last_ttl; - last_real_seqno = orig_ifinfo->last_real_seqno; - batadv_orig_ifinfo_put(orig_ifinfo); - - if (last_real_seqno != ntohl(ogm_packet->seqno)) - return false; - if (last_ttl != ogm_packet->ttl + 1) - return false; - if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) - return false; - if (ogm_packet->tq < bat_priv->nc.min_tq) - return false; - - return true; -} - -/** - * batadv_nc_find_nc_node() - search for an existing nc node and return it - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @in_coding: traverse incoming or outgoing network coding list - * - * Return: the nc_node if found, NULL otherwise. - */ -static struct batadv_nc_node * -batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) -{ - struct batadv_nc_node *nc_node, *nc_node_out = NULL; - struct list_head *list; - - if (in_coding) - list = &orig_neigh_node->in_coding_list; - else - list = &orig_neigh_node->out_coding_list; - - /* Traverse list of nc_nodes to orig_node */ - rcu_read_lock(); - list_for_each_entry_rcu(nc_node, list, list) { - if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) - continue; - - if (!kref_get_unless_zero(&nc_node->refcount)) - continue; - - /* Found a match */ - nc_node_out = nc_node; - break; - } - rcu_read_unlock(); - - return nc_node_out; -} - -/** - * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was - * not found - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @in_coding: traverse incoming or outgoing network coding list - * - * Return: the nc_node if found or created, NULL in case of an error. 
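The lookup in batadv_nc_find_nc_node() above is the standard RCU pattern: walk the list under rcu_read_lock() and take a reference with kref_get_unless_zero(), skipping entries whose refcount has already dropped to zero and which are therefore on their way to being freed. A generic kernel-style sketch of the idiom (struct node and node_find_get() are hypothetical, not batman-adv code):

#include <linux/etherdevice.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct node {
	struct list_head list;
	struct kref refcount;
	u8 addr[ETH_ALEN];
};

static struct node *node_find_get(struct list_head *head, const u8 *addr)
{
	struct node *node, *found = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(node, head, list) {
		if (!ether_addr_equal(node->addr, addr))
			continue;

		/* a refcount of zero means the entry is being freed */
		if (!kref_get_unless_zero(&node->refcount))
			continue;

		found = node;
		break;
	}
	rcu_read_unlock();

	return found;
}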
- */ -static struct batadv_nc_node * -batadv_nc_get_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) -{ - struct batadv_nc_node *nc_node; - spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ - struct list_head *list; - - /* Select ingoing or outgoing coding node */ - if (in_coding) { - lock = &orig_neigh_node->in_coding_list_lock; - list = &orig_neigh_node->in_coding_list; - } else { - lock = &orig_neigh_node->out_coding_list_lock; - list = &orig_neigh_node->out_coding_list; - } - - spin_lock_bh(lock); - - /* Check if nc_node is already added */ - nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); - - /* Node found */ - if (nc_node) - goto unlock; - - nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); - if (!nc_node) - goto unlock; - - /* Initialize nc_node */ - INIT_LIST_HEAD(&nc_node->list); - kref_init(&nc_node->refcount); - ether_addr_copy(nc_node->addr, orig_node->orig); - kref_get(&orig_neigh_node->refcount); - nc_node->orig_node = orig_neigh_node; - - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", - nc_node->addr, nc_node->orig_node->orig); - - /* Add nc_node to orig_node */ - kref_get(&nc_node->refcount); - list_add_tail_rcu(&nc_node->list, list); - -unlock: - spin_unlock_bh(lock); - - return nc_node; -} - -/** - * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node - * structs (best called on incoming OGMs) - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @ogm_packet: incoming ogm packet - * @is_single_hop_neigh: orig_node is a single hop neighbor - */ -void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh) -{ - struct batadv_nc_node *in_nc_node = NULL; - struct batadv_nc_node *out_nc_node = NULL; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* check if orig node is network coding enabled */ - if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) - goto out; - - /* accept ogms from 'good' neighbors and single hop neighbors */ - if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) && - !is_single_hop_neigh) - goto out; - - /* Add orig_node as in_nc_node on hop */ - in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node, - orig_neigh_node, true); - if (!in_nc_node) - goto out; - - in_nc_node->last_seen = jiffies; - - /* Add hop as out_nc_node on orig_node */ - out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node, - orig_node, false); - if (!out_nc_node) - goto out; - - out_nc_node->last_seen = jiffies; - -out: - batadv_nc_node_put(in_nc_node); - batadv_nc_node_put(out_nc_node); -} - -/** - * batadv_nc_get_path() - get existing nc_path or allocate a new one - * @bat_priv: the bat priv with all the mesh interface information - * @hash: hash table containing the nc path - * @src: ethernet source address - first half of the nc path search key - * @dst: ethernet destination address - second half of the nc path search key - * - * Return: pointer to nc_path if the path was found or created, returns NULL - * on error. 
- */ -static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - u8 *src, - u8 *dst) -{ - int hash_added; - struct batadv_nc_path *nc_path, nc_path_key; - - batadv_nc_hash_key_gen(&nc_path_key, src, dst); - - /* Search for existing nc_path */ - nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key); - - if (nc_path) { - /* Set timestamp to delay removal of nc_path */ - nc_path->last_valid = jiffies; - return nc_path; - } - - /* No existing nc_path was found; create a new */ - nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC); - - if (!nc_path) - return NULL; - - /* Initialize nc_path */ - INIT_LIST_HEAD(&nc_path->packet_list); - spin_lock_init(&nc_path->packet_list_lock); - kref_init(&nc_path->refcount); - nc_path->last_valid = jiffies; - ether_addr_copy(nc_path->next_hop, dst); - ether_addr_copy(nc_path->prev_hop, src); - - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n", - nc_path->prev_hop, - nc_path->next_hop); - - /* Add nc_path to hash table */ - kref_get(&nc_path->refcount); - hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, - batadv_nc_hash_choose, &nc_path_key, - &nc_path->hash_entry); - - if (hash_added < 0) { - kfree(nc_path); - return NULL; - } - - return nc_path; -} - -/** - * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair - * selection of a receiver with slightly lower TQ than the other - * @tq: to be weighted tq value - * - * Return: scaled tq value - */ -static u8 batadv_nc_random_weight_tq(u8 tq) -{ - /* randomize the estimated packet loss (max TQ - estimated TQ) */ - u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq); - - /* convert to (randomized) estimated tq again */ - return BATADV_TQ_MAX_VALUE - rand_tq; -} - -/** - * batadv_nc_memxor() - XOR destination with source - * @dst: byte array to XOR into - * @src: byte array to XOR from - * @len: length of destination array - */ -static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; ++i) - dst[i] ^= src[i]; -} - -/** - * batadv_nc_code_packets() - code a received unicast_packet with an nc packet - * into a coded_packet and send it - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to forward - * @ethhdr: pointer to the ethernet header inside the skb - * @nc_packet: structure containing the packet to the skb can be coded with - * @neigh_node: next hop to forward packet to - * - * Return: true if both packets are consumed, false otherwise. 
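batadv_nc_random_weight_tq() above randomizes the estimated packet loss (BATADV_TQ_MAX_VALUE - tq) so that a candidate with a slightly lower TQ still wins a share of the comparisons; the result is uniform in [tq, BATADV_TQ_MAX_VALUE]. A userspace sketch of the same arithmetic, with rand() standing in for get_random_u32_below() (illustrative only):

#include <stdio.h>
#include <stdlib.h>

#define TQ_MAX_VALUE 255

/* Scale tq by randomizing the estimated loss (TQ_MAX_VALUE - tq):
 * the result is uniform in [tq, TQ_MAX_VALUE].
 */
static unsigned int random_weight_tq(unsigned int tq)
{
	unsigned int rand_loss = rand() % (TQ_MAX_VALUE + 1 - tq);

	return TQ_MAX_VALUE - rand_loss;
}

int main(void)
{
	/* two candidates with nearly equal link quality */
	unsigned int a = 200, b = 198, wins_b = 0;
	int i;

	for (i = 0; i < 100000; i++)
		if (random_weight_tq(b) > random_weight_tq(a))
			wins_b++;

	printf("b selected in %u of 100000 trials\n", wins_b);
	return 0;
}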
- */ -static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, - struct sk_buff *skb, - struct ethhdr *ethhdr, - struct batadv_nc_packet *nc_packet, - struct batadv_neigh_node *neigh_node) -{ - u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; - struct sk_buff *skb_dest, *skb_src; - struct batadv_unicast_packet *packet1; - struct batadv_unicast_packet *packet2; - struct batadv_coded_packet *coded_packet; - struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; - struct batadv_neigh_node *router_coding = NULL, *second_dest; - struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; - struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; - u8 *first_source, *second_source; - __be32 packet_id1, packet_id2; - size_t count; - bool res = false; - int coding_len; - int unicast_size = sizeof(*packet1); - int coded_size = sizeof(*coded_packet); - int header_add = coded_size - unicast_size; - - /* TODO: do we need to consider the outgoing interface for - * coded packets? - */ - router_neigh = batadv_orig_router_get(neigh_node->orig_node, - BATADV_IF_DEFAULT); - if (!router_neigh) - goto out; - - router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh, - BATADV_IF_DEFAULT); - if (!router_neigh_ifinfo) - goto out; - - neigh_tmp = nc_packet->neigh_node; - router_coding = batadv_orig_router_get(neigh_tmp->orig_node, - BATADV_IF_DEFAULT); - if (!router_coding) - goto out; - - router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding, - BATADV_IF_DEFAULT); - if (!router_coding_ifinfo) - goto out; - - tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg; - tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp); - tq_tmp = router_coding_ifinfo->bat_iv.tq_avg; - tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp); - - /* Select one destination for the MAC-header dst-field based on - * weighted TQ-values. - */ - if (tq_weighted_neigh >= tq_weighted_coding) { - /* Destination from nc_packet is selected for MAC-header */ - first_dest = nc_packet->neigh_node; - first_source = nc_packet->nc_path->prev_hop; - second_dest = neigh_node; - second_source = ethhdr->h_source; - packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; - packet2 = (struct batadv_unicast_packet *)skb->data; - packet_id1 = nc_packet->packet_id; - packet_id2 = batadv_skb_crc32(skb, - skb->data + sizeof(*packet2)); - } else { - /* Destination for skb is selected for MAC-header */ - first_dest = neigh_node; - first_source = ethhdr->h_source; - second_dest = nc_packet->neigh_node; - second_source = nc_packet->nc_path->prev_hop; - packet1 = (struct batadv_unicast_packet *)skb->data; - packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; - packet_id1 = batadv_skb_crc32(skb, - skb->data + sizeof(*packet1)); - packet_id2 = nc_packet->packet_id; - } - - /* Instead of zero padding the smallest data buffer, we - * code into the largest. 
- */ - if (skb->len <= nc_packet->skb->len) { - skb_dest = nc_packet->skb; - skb_src = skb; - } else { - skb_dest = skb; - skb_src = nc_packet->skb; - } - - /* coding_len is used when decoding the packet shorter packet */ - coding_len = skb_src->len - unicast_size; - - if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0) - goto out; - - skb_push(skb_dest, header_add); - - coded_packet = (struct batadv_coded_packet *)skb_dest->data; - skb_reset_mac_header(skb_dest); - - coded_packet->packet_type = BATADV_CODED; - coded_packet->version = BATADV_COMPAT_VERSION; - coded_packet->ttl = packet1->ttl; - - /* Info about first unicast packet */ - ether_addr_copy(coded_packet->first_source, first_source); - ether_addr_copy(coded_packet->first_orig_dest, packet1->dest); - coded_packet->first_crc = packet_id1; - coded_packet->first_ttvn = packet1->ttvn; - - /* Info about second unicast packet */ - ether_addr_copy(coded_packet->second_dest, second_dest->addr); - ether_addr_copy(coded_packet->second_source, second_source); - ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); - coded_packet->second_crc = packet_id2; - coded_packet->second_ttl = packet2->ttl; - coded_packet->second_ttvn = packet2->ttvn; - coded_packet->coded_len = htons(coding_len); - - /* This is where the magic happens: Code skb_src into skb_dest */ - batadv_nc_memxor(skb_dest->data + coded_size, - skb_src->data + unicast_size, coding_len); - - /* Update counters accordingly */ - if (BATADV_SKB_CB(skb_src)->decoded && - BATADV_SKB_CB(skb_dest)->decoded) { - /* Both packets are recoded */ - count = skb_src->len + ETH_HLEN; - count += skb_dest->len + ETH_HLEN; - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count); - } else if (!BATADV_SKB_CB(skb_src)->decoded && - !BATADV_SKB_CB(skb_dest)->decoded) { - /* Both packets are newly coded */ - count = skb_src->len + ETH_HLEN; - count += skb_dest->len + ETH_HLEN; - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count); - } else if (BATADV_SKB_CB(skb_src)->decoded && - !BATADV_SKB_CB(skb_dest)->decoded) { - /* skb_src recoded and skb_dest is newly coded */ - batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, - skb_src->len + ETH_HLEN); - batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, - skb_dest->len + ETH_HLEN); - } else if (!BATADV_SKB_CB(skb_src)->decoded && - BATADV_SKB_CB(skb_dest)->decoded) { - /* skb_src is newly coded and skb_dest is recoded */ - batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, - skb_src->len + ETH_HLEN); - batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, - skb_dest->len + ETH_HLEN); - } - - /* skb_src is now coded into skb_dest, so free it */ - consume_skb(skb_src); - - /* avoid duplicate free of skb from nc_packet */ - nc_packet->skb = NULL; - batadv_nc_packet_free(nc_packet, false); - - /* Send the coded packet and return true */ - batadv_send_unicast_skb(skb_dest, first_dest); - res = true; -out: - batadv_neigh_node_put(router_neigh); - batadv_neigh_node_put(router_coding); - batadv_neigh_ifinfo_put(router_neigh_ifinfo); - batadv_neigh_ifinfo_put(router_coding_ifinfo); - return res; -} - -/** - * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst. 
- * @skb: data skb to forward - * @dst: destination mac address of the other skb to code with - * @src: source mac address of skb - * - * Whenever we network code a packet we have to check whether we received it in - * a network coded form. If so, we may not be able to use it for coding because - * some neighbors may also have received (overheard) the packet in the network - * coded form without being able to decode it. It is hard to know which of the - * neighboring nodes was able to decode the packet, therefore we can only - * re-code the packet if the source of the previous encoded packet is involved. - * Since the source encoded the packet we can be certain it has all necessary - * decode information. - * - * Return: true if coding of a decoded packet is allowed. - */ -static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) -{ - if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) - return false; - return true; -} - -/** - * batadv_nc_path_search() - Find the coding path matching in_nc_node and - * out_nc_node to retrieve a buffered packet that can be used for coding. - * @bat_priv: the bat priv with all the mesh interface information - * @in_nc_node: pointer to skb next hop's neighbor nc node - * @out_nc_node: pointer to skb source's neighbor nc node - * @skb: data skb to forward - * @eth_dst: next hop mac address of skb - * - * Return: true if coding of a decoded skb is allowed. - */ -static struct batadv_nc_packet * -batadv_nc_path_search(struct batadv_priv *bat_priv, - struct batadv_nc_node *in_nc_node, - struct batadv_nc_node *out_nc_node, - struct sk_buff *skb, - u8 *eth_dst) -{ - struct batadv_nc_path *nc_path, nc_path_key; - struct batadv_nc_packet *nc_packet_out = NULL; - struct batadv_nc_packet *nc_packet, *nc_packet_tmp; - struct batadv_hashtable *hash = bat_priv->nc.coding_hash; - int idx; - - if (!hash) - return NULL; - - /* Create almost path key */ - batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr, - out_nc_node->addr); - idx = batadv_nc_hash_choose(&nc_path_key, hash->size); - - /* Check for coding opportunities in this nc_path */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) { - if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr)) - continue; - - if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr)) - continue; - - spin_lock_bh(&nc_path->packet_list_lock); - if (list_empty(&nc_path->packet_list)) { - spin_unlock_bh(&nc_path->packet_list_lock); - continue; - } - - list_for_each_entry_safe(nc_packet, nc_packet_tmp, - &nc_path->packet_list, list) { - if (!batadv_nc_skb_coding_possible(nc_packet->skb, - eth_dst, - in_nc_node->addr)) - continue; - - /* Coding opportunity is found! */ - list_del(&nc_packet->list); - nc_packet_out = nc_packet; - break; - } - - spin_unlock_bh(&nc_path->packet_list_lock); - break; - } - rcu_read_unlock(); - - return nc_packet_out; -} - -/** - * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of - * the skb's sender (may be equal to the originator). - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to forward - * @eth_dst: next hop mac address of skb - * @eth_src: source mac address of skb - * @in_nc_node: pointer to skb next hop's neighbor nc node - * - * Return: an nc packet if a suitable coding packet was found, NULL otherwise. 
- */ -static struct batadv_nc_packet * -batadv_nc_skb_src_search(struct batadv_priv *bat_priv, - struct sk_buff *skb, - u8 *eth_dst, - u8 *eth_src, - struct batadv_nc_node *in_nc_node) -{ - struct batadv_orig_node *orig_node; - struct batadv_nc_node *out_nc_node; - struct batadv_nc_packet *nc_packet = NULL; - - orig_node = batadv_orig_hash_find(bat_priv, eth_src); - if (!orig_node) - return NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(out_nc_node, - &orig_node->out_coding_list, list) { - /* Check if the skb is decoded and if recoding is possible */ - if (!batadv_nc_skb_coding_possible(skb, - out_nc_node->addr, eth_src)) - continue; - - /* Search for an opportunity in this nc_path */ - nc_packet = batadv_nc_path_search(bat_priv, in_nc_node, - out_nc_node, skb, eth_dst); - if (nc_packet) - break; - } - rcu_read_unlock(); - - batadv_orig_node_put(orig_node); - return nc_packet; -} - -/** - * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the - * unicast skb before it is stored for use in later decoding - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to store - * @eth_dst_new: new destination mac address of skb - */ -static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, - struct sk_buff *skb, - u8 *eth_dst_new) -{ - struct ethhdr *ethhdr; - - /* Copy skb header to change the mac header */ - skb = pskb_copy_for_clone(skb, GFP_ATOMIC); - if (!skb) - return; - - /* Set the mac header as if we actually sent the packet uncoded */ - ethhdr = eth_hdr(skb); - ether_addr_copy(ethhdr->h_source, ethhdr->h_dest); - ether_addr_copy(ethhdr->h_dest, eth_dst_new); - - /* Set data pointer to MAC header to mimic packets from our tx path */ - skb_push(skb, ETH_HLEN); - - /* Add the packet to the decoding packet pool */ - batadv_nc_skb_store_for_decoding(bat_priv, skb); - - /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free - * our ref - */ - consume_skb(skb); -} - -/** - * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst. - * @skb: data skb to forward - * @neigh_node: next hop to forward packet to - * @ethhdr: pointer to the ethernet header inside the skb - * - * Loops through the list of neighboring nodes the next hop has a good - * connection to (receives OGMs with a sufficient quality). We need to find a - * neighbor of our next hop that potentially sent a packet which our next hop - * also received (overheard) and has stored for later decoding. 
- * - * Return: true if the skb was consumed (encoded packet sent) or false otherwise - */ -static bool batadv_nc_skb_dst_search(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node, - struct ethhdr *ethhdr) -{ - struct net_device *netdev = neigh_node->if_incoming->mesh_iface; - struct batadv_priv *bat_priv = netdev_priv(netdev); - struct batadv_orig_node *orig_node = neigh_node->orig_node; - struct batadv_nc_node *nc_node; - struct batadv_nc_packet *nc_packet = NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) { - /* Search for coding opportunity with this in_nc_node */ - nc_packet = batadv_nc_skb_src_search(bat_priv, skb, - neigh_node->addr, - ethhdr->h_source, nc_node); - - /* Opportunity was found, so stop searching */ - if (nc_packet) - break; - } - rcu_read_unlock(); - - if (!nc_packet) - return false; - - /* Save packets for later decoding */ - batadv_nc_skb_store_before_coding(bat_priv, skb, - neigh_node->addr); - batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb, - nc_packet->neigh_node->addr); - - /* Code and send packets */ - if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet, - neigh_node)) - return true; - - /* out of mem ? Coding failed - we have to free the buffered packet - * to avoid memleaks. The skb passed as argument will be dealt with - * by the calling function. - */ - batadv_nc_send_packet(nc_packet); - return false; -} - -/** - * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding - * @skb: skb to add to path - * @nc_path: path to add skb to - * @neigh_node: next hop to forward packet to - * @packet_id: checksum to identify packet - * - * Return: true if the packet was buffered or false in case of an error. - */ -static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, - struct batadv_nc_path *nc_path, - struct batadv_neigh_node *neigh_node, - __be32 packet_id) -{ - struct batadv_nc_packet *nc_packet; - - nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC); - if (!nc_packet) - return false; - - /* Initialize nc_packet */ - nc_packet->timestamp = jiffies; - nc_packet->packet_id = packet_id; - nc_packet->skb = skb; - nc_packet->neigh_node = neigh_node; - nc_packet->nc_path = nc_path; - - /* Add coding packet to list */ - spin_lock_bh(&nc_path->packet_list_lock); - list_add_tail(&nc_packet->list, &nc_path->packet_list); - spin_unlock_bh(&nc_path->packet_list_lock); - - return true; -} - -/** - * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet - * buffer - * @skb: data skb to forward - * @neigh_node: next hop to forward packet to - * - * Return: true if the skb was consumed (encoded packet sent) or false otherwise - */ -bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node) -{ - const struct net_device *netdev = neigh_node->if_incoming->mesh_iface; - struct batadv_priv *bat_priv = netdev_priv(netdev); - struct batadv_unicast_packet *packet; - struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = eth_hdr(skb); - __be32 packet_id; - u8 *payload; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* We only handle unicast packets */ - payload = skb_network_header(skb); - packet = (struct batadv_unicast_packet *)payload; - if (packet->packet_type != BATADV_UNICAST) - goto out; - - /* Try to find a coding opportunity and send the skb if one is found */ - if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr)) - return true; - - /* Find or create a nc_path for this 
src-dst pair */ - nc_path = batadv_nc_get_path(bat_priv, - bat_priv->nc.coding_hash, - ethhdr->h_source, - neigh_node->addr); - - if (!nc_path) - goto out; - - /* Add skb to nc_path */ - packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); - if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id)) - goto free_nc_path; - - /* Packet is consumed */ - return true; - -free_nc_path: - batadv_nc_path_put(nc_path); -out: - /* Packet is not consumed */ - return false; -} - -/** - * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be - * used when decoding coded packets - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to store - */ -void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ - struct batadv_unicast_packet *packet; - struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = eth_hdr(skb); - __be32 packet_id; - u8 *payload; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* Check for supported packet type */ - payload = skb_network_header(skb); - packet = (struct batadv_unicast_packet *)payload; - if (packet->packet_type != BATADV_UNICAST) - goto out; - - /* Find existing nc_path or create a new */ - nc_path = batadv_nc_get_path(bat_priv, - bat_priv->nc.decoding_hash, - ethhdr->h_source, - ethhdr->h_dest); - - if (!nc_path) - goto out; - - /* Clone skb and adjust skb->data to point at batman header */ - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) - goto free_nc_path; - - if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) - goto free_skb; - - if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN))) - goto free_skb; - - /* Add skb to nc_path */ - packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); - if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id)) - goto free_skb; - - batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER); - return; - -free_skb: - kfree_skb(skb); -free_nc_path: - batadv_nc_path_put(nc_path); -out: - return; -} - -/** - * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet - * should be saved in the decoding buffer and, if so, store it there - * @bat_priv: the bat priv with all the mesh interface information - * @skb: unicast skb to store - */ -void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ - struct ethhdr *ethhdr = eth_hdr(skb); - - if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) - return; - - /* Set data pointer to MAC header to mimic packets from our tx path */ - skb_push(skb, ETH_HLEN); - - batadv_nc_skb_store_for_decoding(bat_priv, skb); -} - -/** - * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored - * in nc_packet - * @bat_priv: the bat priv with all the mesh interface information - * @skb: unicast skb to decode - * @nc_packet: decode data needed to decode the skb - * - * Return: pointer to decoded unicast packet if the packet was decoded or NULL - * in case of an error. 
- */ -static struct batadv_unicast_packet * -batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_nc_packet *nc_packet) -{ - const int h_size = sizeof(struct batadv_unicast_packet); - const int h_diff = sizeof(struct batadv_coded_packet) - h_size; - struct batadv_unicast_packet *unicast_packet; - struct batadv_coded_packet coded_packet_tmp; - struct ethhdr *ethhdr, ethhdr_tmp; - u8 *orig_dest, ttl, ttvn; - unsigned int coding_len; - int err; - - /* Save headers temporarily */ - memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); - memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp)); - - if (skb_cow(skb, 0) < 0) - return NULL; - - if (unlikely(!skb_pull_rcsum(skb, h_diff))) - return NULL; - - /* Data points to batman header, so set mac header 14 bytes before - * and network to data - */ - skb_set_mac_header(skb, -ETH_HLEN); - skb_reset_network_header(skb); - - /* Reconstruct original mac header */ - ethhdr = eth_hdr(skb); - *ethhdr = ethhdr_tmp; - - /* Select the correct unicast header information based on the location - * of our mac address in the coded_packet header - */ - if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) { - /* If we are the second destination the packet was overheard, - * so the Ethernet address must be copied to h_dest and - * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST - */ - ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest); - skb->pkt_type = PACKET_HOST; - - orig_dest = coded_packet_tmp.second_orig_dest; - ttl = coded_packet_tmp.second_ttl; - ttvn = coded_packet_tmp.second_ttvn; - } else { - orig_dest = coded_packet_tmp.first_orig_dest; - ttl = coded_packet_tmp.ttl; - ttvn = coded_packet_tmp.first_ttvn; - } - - coding_len = ntohs(coded_packet_tmp.coded_len); - - /* ensure dst buffer is large enough (payload only) */ - if (coding_len + h_size > skb->len) - return NULL; - - /* ensure src buffer is large enough (payload only) */ - if (coding_len + h_size > nc_packet->skb->len) - return NULL; - - /* Here the magic is reversed: - * extract the missing packet from the received coded packet - */ - batadv_nc_memxor(skb->data + h_size, - nc_packet->skb->data + h_size, - coding_len); - - /* Resize decoded skb if decoded with larger packet */ - if (nc_packet->skb->len > coding_len + h_size) { - err = pskb_trim_rcsum(skb, coding_len + h_size); - if (err) - return NULL; - } - - /* Create decoded unicast packet */ - unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_packet->packet_type = BATADV_UNICAST; - unicast_packet->version = BATADV_COMPAT_VERSION; - unicast_packet->ttl = ttl; - ether_addr_copy(unicast_packet->dest, orig_dest); - unicast_packet->ttvn = ttvn; - - batadv_nc_packet_free(nc_packet, false); - return unicast_packet; -} - -/** - * batadv_nc_find_decoding_packet() - search through buffered decoding data to - * find the data needed to decode the coded packet - * @bat_priv: the bat priv with all the mesh interface information - * @ethhdr: pointer to the ethernet header inside the coded packet - * @coded: coded packet we try to find decode data for - * - * Return: pointer to nc packet if the needed data was found or NULL otherwise.
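Buffered packets are matched purely by a checksum of their payload: batadv_skb_crc32() computes it at buffering time, and the coded header carries it as first_crc/second_crc so the decoder can find the counterpart packet. A toy userspace sketch of that id-based matching, with zlib's crc32() standing in for batadv_skb_crc32() (which actually uses crc32c; illustrative only, assumes zlib is available and all names are made up):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct buffered {
	unsigned long id;		/* crc32 of payload at buffering time */
	const unsigned char *payload;
	size_t len;
};

static unsigned long payload_id(const unsigned char *data, size_t len)
{
	return crc32(crc32(0L, Z_NULL, 0), data, (unsigned int)len);
}

/* Return the buffered packet whose id matches, as the decoder does when
 * it looks up the counterpart of a received coded packet.
 */
static const struct buffered *find_by_id(const struct buffered *q, size_t n,
					 unsigned long id)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (q[i].id == id)
			return &q[i];
	return NULL;
}

int main(void)
{
	static const unsigned char p1[] = "first payload";
	static const unsigned char p2[] = "second payload";
	struct buffered q[2] = {
		{ 0, p1, sizeof(p1) },
		{ 0, p2, sizeof(p2) },
	};
	const struct buffered *hit;

	q[0].id = payload_id(p1, sizeof(p1));
	q[1].id = payload_id(p2, sizeof(p2));

	/* the "coded header" would carry this id for the second packet */
	hit = find_by_id(q, 2, payload_id(p2, sizeof(p2)));

	printf("match: %s\n", hit ? (const char *)hit->payload : "none");
	return 0;
}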
- */ -static struct batadv_nc_packet * -batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr, - struct batadv_coded_packet *coded) -{ - struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; - struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; - struct batadv_nc_path *nc_path, nc_path_key; - u8 *dest, *source; - __be32 packet_id; - int index; - - if (!hash) - return NULL; - - /* Select the correct packet id based on the location of our mac-addr */ - dest = ethhdr->h_source; - if (!batadv_is_my_mac(bat_priv, coded->second_dest)) { - source = coded->second_source; - packet_id = coded->second_crc; - } else { - source = coded->first_source; - packet_id = coded->first_crc; - } - - batadv_nc_hash_key_gen(&nc_path_key, source, dest); - index = batadv_nc_hash_choose(&nc_path_key, hash->size); - - /* Search for matching coding path */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) { - /* Find matching nc_packet */ - spin_lock_bh(&nc_path->packet_list_lock); - list_for_each_entry(tmp_nc_packet, - &nc_path->packet_list, list) { - if (packet_id == tmp_nc_packet->packet_id) { - list_del(&tmp_nc_packet->list); - - nc_packet = tmp_nc_packet; - break; - } - } - spin_unlock_bh(&nc_path->packet_list_lock); - - if (nc_packet) - break; - } - rcu_read_unlock(); - - if (!nc_packet) - batadv_dbg(BATADV_DBG_NC, bat_priv, - "No decoding packet found for %u\n", packet_id); - - return nc_packet; -} - -/** - * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the - * resulting unicast packet - * @skb: incoming coded packet - * @recv_if: pointer to interface this packet was received on - * - * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP - * otherwise. 
- */ -static int batadv_nc_recv_coded_packet(struct sk_buff *skb, - struct batadv_hard_iface *recv_if) -{ - struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); - struct batadv_unicast_packet *unicast_packet; - struct batadv_coded_packet *coded_packet; - struct batadv_nc_packet *nc_packet; - struct ethhdr *ethhdr; - int hdr_size = sizeof(*coded_packet); - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto free_skb; - - /* Make sure we can access (and remove) header */ - if (unlikely(!pskb_may_pull(skb, hdr_size))) - goto free_skb; - - coded_packet = (struct batadv_coded_packet *)skb->data; - ethhdr = eth_hdr(skb); - - /* Verify frame is destined for us */ - if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && - !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) - goto free_skb; - - /* Update stat counter */ - if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) - batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED); - - nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr, - coded_packet); - if (!nc_packet) { - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); - goto free_skb; - } - - /* Make skb's linear, because decoding accesses the entire buffer */ - if (skb_linearize(skb) < 0) - goto free_nc_packet; - - if (skb_linearize(nc_packet->skb) < 0) - goto free_nc_packet; - - /* Decode the packet */ - unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet); - if (!unicast_packet) { - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); - goto free_nc_packet; - } - - /* Mark packet as decoded to do correct recoding when forwarding */ - BATADV_SKB_CB(skb)->decoded = true; - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES, - skb->len + ETH_HLEN); - return batadv_recv_unicast_packet(skb, recv_if); - -free_nc_packet: - batadv_nc_packet_free(nc_packet, true); -free_skb: - kfree_skb(skb); - - return NET_RX_DROP; -} - -/** - * batadv_nc_mesh_free() - clean up network coding memory - * @bat_priv: the bat priv with all the mesh interface information - */ -void batadv_nc_mesh_free(struct batadv_priv *bat_priv) -{ - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); - batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1); - cancel_delayed_work_sync(&bat_priv->nc.work); - - batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL); - batadv_hash_destroy(bat_priv->nc.coding_hash); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); - batadv_hash_destroy(bat_priv->nc.decoding_hash); -} diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h deleted file mode 100644 index 368cc3130e4c..000000000000 --- a/net/batman-adv/network-coding.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) B.A.T.M.A.N. 
contributors: - * - * Martin Hundebøll, Jeppe Ledet-Pedersen - */ - -#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ -#define _NET_BATMAN_ADV_NETWORK_CODING_H_ - -#include "main.h" - -#include -#include -#include -#include - -#ifdef CONFIG_BATMAN_ADV_NC - -void batadv_nc_status_update(struct net_device *net_dev); -int batadv_nc_init(void); -int batadv_nc_mesh_init(struct batadv_priv *bat_priv); -void batadv_nc_mesh_free(struct batadv_priv *bat_priv); -void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh); -void batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)); -void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv); -void batadv_nc_init_orig(struct batadv_orig_node *orig_node); -bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node); -void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb); -void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb); - -#else /* ifdef CONFIG_BATMAN_ADV_NC */ - -static inline void batadv_nc_status_update(struct net_device *net_dev) -{ -} - -static inline int batadv_nc_init(void) -{ - return 0; -} - -static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv) -{ - return 0; -} - -static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv) -{ -} - -static inline void -batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh) -{ -} - -static inline void -batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ -} - -static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) -{ -} - -static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node) -{ -} - -static inline bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node) -{ - return false; -} - -static inline void -batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ -} - -static inline void -batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ -} - -#endif /* ifdef CONFIG_BATMAN_ADV_NC */ - -#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index a464ff96b929..c84420cb410d 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -37,7 +37,6 @@ #include "log.h" #include "multicast.h" #include "netlink.h" -#include "network-coding.h" #include "routing.h" #include "translation-table.h" @@ -883,9 +882,6 @@ void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->vlan_list_lock); - /* Free nc_nodes */ - batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } @@ -959,8 +955,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, spin_lock_init(&orig_node->tt_lock); spin_lock_init(&orig_node->vlan_list_lock); - batadv_nc_init_orig(orig_node); - /* extra reference for return */ kref_init(&orig_node->refcount); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c 
index 35d8c5783999..12c16f81cc51 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -31,7 +31,6 @@ #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" -#include "network-coding.h" #include "originator.h" #include "send.h" #include "tp_meter.h" @@ -956,15 +955,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, /* function returns -EREMOTE for promiscuous packets */ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size); - - /* Even though the packet is not for us, we might save it to use for - * decoding a later received coded packet - */ - if (check == -EREMOTE) - batadv_nc_skb_store_sniffed_unicast(bat_priv, skb); - if (check < 0) goto free_skb; + if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) goto free_skb; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 95849ba004e7..20d85c681064 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -34,7 +34,6 @@ #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "translation-table.h" @@ -63,12 +62,9 @@ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { - struct batadv_priv *bat_priv; struct ethhdr *ethhdr; int ret; - bat_priv = netdev_priv(hard_iface->mesh_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; @@ -97,9 +93,6 @@ int batadv_send_skb_packet(struct sk_buff *skb, skb->dev = hard_iface->net_dev; - /* Save a clone of the skb to use when decoding coded packets */ - batadv_nc_skb_store_for_decoding(bat_priv, skb); - /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. @@ -202,14 +195,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, goto put_neigh_node; } - /* try to network code the packet, if it is received on an interface - * (i.e. being forwarded). If the packet originates from this node or if - * network coding fails, then send the packet as usual. 
- */ - if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) - ret = -EINPROGRESS; - else - ret = batadv_send_unicast_skb(skb, neigh_node); + ret = batadv_send_unicast_skb(skb, neigh_node); /* skb was consumed */ skb = NULL; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8d0e04e770cb..6e95e883c2bf 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -212,7 +212,7 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period - * @ref: kref pointer of the nc_node + * @ref: kref pointer of the batadv_tt_local_entry */ static void batadv_tt_local_entry_release(struct kref *ref) { @@ -244,7 +244,7 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period - * @ref: kref pointer of the nc_node + * @ref: kref pointer of the batadv_tt_global_entry */ void batadv_tt_global_entry_release(struct kref *ref) { diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 0ca0fc072fc9..ae1d7a8dc480 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -505,20 +505,6 @@ struct batadv_orig_node { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; -#ifdef CONFIG_BATMAN_ADV_NC - /** @in_coding_list: list of nodes this orig can hear */ - struct list_head in_coding_list; - - /** @out_coding_list: list of nodes that can hear this orig */ - struct list_head out_coding_list; - - /** @in_coding_list_lock: protects in_coding_list */ - spinlock_t in_coding_list_lock; - - /** @out_coding_list_lock: protects out_coding_list */ - spinlock_t out_coding_list_lock; -#endif - /** @fragments: array with heads for fragment chains */ struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT]; @@ -545,9 +531,6 @@ enum batadv_orig_capabilities { */ BATADV_ORIG_CAPA_HAS_DAT, - /** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */ - BATADV_ORIG_CAPA_HAS_NC, - /** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */ BATADV_ORIG_CAPA_HAS_TT, @@ -953,60 +936,6 @@ enum batadv_counters { BATADV_CNT_DAT_CACHED_REPLY_TX, #endif -#ifdef CONFIG_BATMAN_ADV_NC - /** - * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter - */ - BATADV_CNT_NC_CODE, - - /** - * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes - * counter - */ - BATADV_CNT_NC_CODE_BYTES, - - /** - * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet - * counter - */ - BATADV_CNT_NC_RECODE, - - /** - * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes - * counter - */ - BATADV_CNT_NC_RECODE_BYTES, - - /** - * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc - * decoding - */ - BATADV_CNT_NC_BUFFER, - - /** - * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter - */ - BATADV_CNT_NC_DECODE, - - /** - * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes - * counter - */ - BATADV_CNT_NC_DECODE_BYTES, - - /** - * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic - * packet counter - */ - BATADV_CNT_NC_DECODE_FAILED, - - /** - * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in - * promisc mode. 
- */ - BATADV_CNT_NC_SNIFFED, -#endif - /** @BATADV_CNT_NUM: number of traffic counters */ BATADV_CNT_NUM, }; @@ -1339,56 +1268,6 @@ struct batadv_priv_mcast { }; #endif -/** - * struct batadv_priv_nc - per mesh interface network coding private data - */ -struct batadv_priv_nc { - /** @work: work queue callback item for cleanup */ - struct delayed_work work; - - /** - * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq - */ - u8 min_tq; - - /** - * @max_fwd_delay: maximum packet forward delay to allow coding of - * packets - */ - u32 max_fwd_delay; - - /** - * @max_buffer_time: buffer time for sniffed packets used to decoding - */ - u32 max_buffer_time; - - /** - * @timestamp_fwd_flush: timestamp of last forward packet queue flush - */ - unsigned long timestamp_fwd_flush; - - /** - * @timestamp_sniffed_purge: timestamp of last sniffed packet queue - * purge - */ - unsigned long timestamp_sniffed_purge; - - /** - * @coding_hash: Hash table used to buffer skbs while waiting for - * another incoming skb to code it with. Skbs are added to the buffer - * just before being forwarded in routing.c - */ - struct batadv_hashtable *coding_hash; - - /** - * @decoding_hash: Hash table used to buffer skbs that might be needed - * to decode a received coded skb. The buffer is used for 1) skbs - * arriving on the mesh-interface; 2) skbs overheard on the - * hard-interface; and 3) skbs forwarded by batman-adv. - */ - struct batadv_hashtable *decoding_hash; -}; - /** * struct batadv_tp_unacked - unacked packet meta-information * @@ -1775,16 +1654,6 @@ struct batadv_priv { struct batadv_priv_mcast mcast; #endif -#ifdef CONFIG_BATMAN_ADV_NC - /** - * @network_coding: bool indicating whether network coding is enabled - */ - atomic_t network_coding; - - /** @nc: network coding data */ - struct batadv_priv_nc nc; -#endif /* CONFIG_BATMAN_ADV_NC */ - #ifdef CONFIG_BATMAN_ADV_BATMAN_V /** @bat_v: B.A.T.M.A.N. 
V per mesh-interface private data */ struct batadv_priv_bat_v bat_v; @@ -2016,96 +1885,11 @@ struct batadv_tt_roam_node { struct list_head list; }; -/** - * struct batadv_nc_node - network coding node - */ -struct batadv_nc_node { - /** @list: next and prev pointer for the list handling */ - struct list_head list; - - /** @addr: the node's mac address */ - u8 addr[ETH_ALEN]; - - /** @refcount: number of contexts the object is used by */ - struct kref refcount; - - /** @rcu: struct used for freeing in an RCU-safe manner */ - struct rcu_head rcu; - - /** @orig_node: pointer to corresponding orig node struct */ - struct batadv_orig_node *orig_node; - - /** @last_seen: timestamp of last ogm received from this node */ - unsigned long last_seen; -}; - -/** - * struct batadv_nc_path - network coding path - */ -struct batadv_nc_path { - /** @hash_entry: next and prev pointer for the list handling */ - struct hlist_node hash_entry; - - /** @rcu: struct used for freeing in an RCU-safe manner */ - struct rcu_head rcu; - - /** @refcount: number of contexts the object is used by */ - struct kref refcount; - - /** @packet_list: list of buffered packets for this path */ - struct list_head packet_list; - - /** @packet_list_lock: access lock for packet list */ - spinlock_t packet_list_lock; - - /** @next_hop: next hop (destination) of path */ - u8 next_hop[ETH_ALEN]; - - /** @prev_hop: previous hop (source) of path */ - u8 prev_hop[ETH_ALEN]; - - /** @last_valid: timestamp for last validation of path */ - unsigned long last_valid; -}; - -/** - * struct batadv_nc_packet - network coding packet used when coding and - * decoding packets - */ -struct batadv_nc_packet { - /** @list: next and prev pointer for the list handling */ - struct list_head list; - - /** @packet_id: crc32 checksum of skb data */ - __be32 packet_id; - - /** - * @timestamp: field containing the info when the packet was added to - * path - */ - unsigned long timestamp; - - /** @neigh_node: pointer to original next hop neighbor of skb */ - struct batadv_neigh_node *neigh_node; - - /** @skb: skb which can be encoded or used for decoding */ - struct sk_buff *skb; - - /** @nc_path: pointer to path this nc packet is attached to */ - struct batadv_nc_path *nc_path; -}; - /** * struct batadv_skb_cb - control buffer structure used to store private data * relevant to batman-adv in the skb->cb buffer in skbs. */ struct batadv_skb_cb { - /** - * @decoded: Marks a skb as decoded, which is checked when searching for - * coding opportunities in network-coding.c - */ - unsigned char decoded:1; - /** @num_bcasts: Counter for broadcast packet retransmissions */ unsigned char num_bcasts; }; -- cgit v1.2.3 From d5d80ac74f80fab4e2647c1030053d71d8c81bc9 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 28 Aug 2025 17:58:59 +0200 Subject: batman-adv: keep skb crc32 helper local in BLA The batadv_skb_crc32() helper was shared between Bridge Loop Avoidance and Network Coding. With the removal of the network coding feature, it is possible to just move this helper directly to Bridge Loop Avoidance. 
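A note on the decode path removed above: batadv_nc_skb_decode_packet() recovers the missing packet by XORing the received coded payload with the buffered one (batadv_nc_memxor), which works because the coded packet was transmitted as A ^ B and (A ^ B) ^ B == A. A minimal standalone sketch of that property, using hypothetical buffers rather than the batman-adv data structures:

    #include <stddef.h>
    #include <stdint.h>

    /* XOR len bytes of src into dst, batadv_nc_memxor()-style. */
    static void nc_memxor(uint8_t *dst, const uint8_t *src, size_t len)
    {
            size_t i;

            for (i = 0; i < len; i++)
                    dst[i] ^= src[i];
    }

    /*
     * Coding:   coded = a ^ b   (one transmission carries two packets)
     * Decoding: a = coded ^ b   (the receiver already buffered b)
     */

The longer of the two payloads determines coded_len, which is why the decoder above trims the skb back to coding_len + h_size when the buffered packet was the larger one.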
Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 34 ++++++++++++++++++++++++++++++++++ net/batman-adv/main.c | 34 ---------------------------------- net/batman-adv/main.h | 1 - 3 files changed, 34 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 747755647c6a..b992ba12aa24 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1584,6 +1585,39 @@ int batadv_bla_init(struct batadv_priv *bat_priv) return 0; } +/** + * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in + * the header + * @skb: skb pointing to fragmented socket buffers + * @payload_ptr: Pointer to position inside the head buffer of the skb + * marking the start of the data to be CRC'ed + * + * payload_ptr must always point to an address in the skb head buffer and not to + * a fragment. + * + * Return: big endian crc32c of the checksummed data + */ +static __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) +{ + unsigned int to = skb->len; + unsigned int consumed = 0; + struct skb_seq_state st; + unsigned int from; + unsigned int len; + const u8 *data; + u32 crc = 0; + + from = (unsigned int)(payload_ptr - skb->data); + + skb_prepare_seq_read(skb, from, to, &st); + while ((len = skb_seq_read(consumed, &data, &st)) != 0) { + crc = crc32c(crc, data, len); + consumed += len; + } + + return htonl(crc); +} + /** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 1dcacfc310ee..3a35aadd8b41 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -561,39 +560,6 @@ void batadv_recv_handler_unregister(u8 packet_type) batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } -/** - * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in - * the header - * @skb: skb pointing to fragmented socket buffers - * @payload_ptr: Pointer to position inside the head buffer of the skb - * marking the start of the data to be CRC'ed - * - * payload_ptr must always point to an address in the skb head buffer and not to - * a fragment. 
- * - * Return: big endian crc32c of the checksummed data - */ -__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) -{ - u32 crc = 0; - unsigned int from; - unsigned int to = skb->len; - struct skb_seq_state st; - const u8 *data; - unsigned int len; - unsigned int consumed = 0; - - from = (unsigned int)(payload_ptr - skb->data); - - skb_prepare_seq_read(skb, from, to, &st); - while ((len = skb_seq_read(consumed, &data, &st)) != 0) { - crc = crc32c(crc, data, len); - consumed += len; - } - - return htonl(crc); -} - /** * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 7352b11df968..2be1ac17acaa 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -248,7 +248,6 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); -__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses -- cgit v1.2.3 From 629a2b18e8729497eeac5b63e575e0961ca3a4ab Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 28 Aug 2025 20:21:43 +0200 Subject: batman-adv: remove includes for extern declarations It is not necessary to include the header for the struct definition for an "extern " declaration. It can simply be dropped from the headers to reduce the number of includes the preprocessor has to process. If needed, it can be added to the actual C source file. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 1 + net/batman-adv/hard-interface.h | 1 - net/batman-adv/mesh-interface.c | 1 + net/batman-adv/mesh-interface.h | 1 - net/batman-adv/netlink.h | 1 - 5 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index bace57e4f9a5..5113f879736b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 262a78364742..9db8a310961e 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index be55d8d87348..df7e95811ef5 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/net/batman-adv/mesh-interface.h b/net/batman-adv/mesh-interface.h index 7ba055b2bc26..53756c5a45e0 100644 --- a/net/batman-adv/mesh-interface.h +++ b/net/batman-adv/mesh-interface.h @@ -13,7 +13,6 @@ #include #include #include -#include int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *mesh_iface, diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index fe4548b974bb..4eae9e5ff135 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -11,7 +11,6 @@ #include #include -#include void batadv_netlink_register(void); void batadv_netlink_unregister(void); -- cgit v1.2.3 From 199cd9e8d14bc14bdbd1fa3031ce26dac9781507 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Sep 2025 09:49:33 -0400 Subject: Revert 
"SUNRPC: Don't allow waiting for exiting tasks" This reverts commit 14e41b16e8cb677bb440dca2edba8b041646c742. This patch breaks the LTP acct02 test, so let's revert and look for a better solution. Reported-by: Mark Brown Reported-by: Harshvardhan Jha Link: https://lore.kernel.org/linux-nfs/7d4d57b0-39a3-49f1-8ada-60364743e3b4@sirena.org.uk/ Cc: stable@vger.kernel.org # 6.15.x Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73bc39281ef5..9b45fbdc90ca 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (unlikely(current->flags & PF_EXITING)) - return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; -- cgit v1.2.3 From 9559d2fffd4f9b892165eed48198a0e5cb8504e6 Mon Sep 17 00:00:00 2001 From: Justin Worrell Date: Thu, 4 Sep 2025 16:09:57 -0500 Subject: SUNRPC: call xs_sock_process_cmsg for all cmsg xs_sock_recv_cmsg was failing to call xs_sock_process_cmsg for any cmsg type other than TLS_RECORD_TYPE_ALERT (TLS_RECORD_TYPE_DATA, and other values not handled.) Based on my reading of the previous commit (cc5d5908: sunrpc: fix client side handling of tls alerts), it looks like only iov_iter_revert should be conditional on TLS_RECORD_TYPE_ALERT (but that other cmsg types should still call xs_sock_process_cmsg). On my machine, I was unable to connect (over mtls) to an NFS share hosted on FreeBSD. With this patch applied, I am able to mount the share again. Fixes: cc5d59081fa2 ("sunrpc: fix client side handling of tls alerts") Signed-off-by: Justin Worrell Reviewed-and-tested-by: Scott Mayhew Link: https://lore.kernel.org/r/20250904211038.12874-3-jworrell@gmail.com Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c5f7bbf5775f..3aa987e7f072 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); - if (ret > 0 && - tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { - iov_iter_revert(&msg.msg_iter, ret); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } -- cgit v1.2.3 From bd64723327e33758803f3b6105f4ef0a1e6cebe0 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 5 Sep 2025 09:50:03 -0700 Subject: net: mctp: fix typo in comment Correct a typo in af_mctp.c: "fist" -> "first". 
Signed-off-by: Alok Tiwari Acked-by: Jeremy Kerr Link: https://patch.msgid.link/20250905165006.3032472-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/mctp/af_mctp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 685524800d70..b99ba14f39d2 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -256,7 +256,7 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) skb_reserve(skb, hlen); - /* set type as fist byte in payload */ + /* set type as first byte in payload */ *(u8 *)skb_put(skb, 1) = addr->smctp_type; rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); -- cgit v1.2.3 From 1dbfb0363224f6da56f6655d596dc5097308d6f5 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Fri, 5 Sep 2025 06:57:27 -0700 Subject: genetlink: fix genl_bind() invoking bind() after -EPERM Per family bind/unbind callbacks were introduced to allow families to track multicast group consumer presence, e.g. to start or stop producing events depending on listeners. However, in genl_bind() the bind() callback was invoked even if capability checks failed and ret was set to -EPERM. This means that callbacks could run on behalf of unauthorized callers while the syscall still returned failure to user space. Fix this by only invoking bind() after the "if (ret) break;" check, i.e. after permission checks have succeeded. Fixes: 3de21a8990d3 ("genetlink: Add per family bind/unbind callbacks") Signed-off-by: Alok Tiwari Link: https://patch.msgid.link/20250905135731.3026965-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d34543..978c129c6095 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (ret) + break; + if (family->bind) family->bind(i); -- cgit v1.2.3 From b7fe8c1be776baa1bec587499e989395c0aee8ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:05 +0000 Subject: ipv6: snmp: remove icmp6type2name[] This 2KB array can be replaced by a switch() to save space.
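The saving is a data-vs-text trade: a 256-entry array of string pointers costs 2KB of .data on 64-bit even though only 15 slots are populated, while a switch produces code (or a jump table) and no writable data. A condensed sketch of the two forms, simplified from the actual list:

    /* Before: 256 pointers, mostly NULL. */
    static const char *const type2name[256] = {
            [1] = "DestUnreachs",   /* ICMPV6_DEST_UNREACH */
            [2] = "PktTooBigs",     /* ICMPV6_PKT_TOOBIG */
            /* ... */
    };

    /* After: the mapping is computed, nothing lives in .data. */
    static const char *type_name(int type)
    {
            switch (type) {
            case 1: return "DestUnreachs";
            case 2: return "PktTooBigs";
            /* ... */
            default: return NULL;   /* un-named types are skipped */
            }
    }

The before/after size output that follows quantifies the effect.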
Before: $ size net/ipv6/proc.o text data bss dec hex filename 6410 624 0 7034 1b7a net/ipv6/proc.o After: $ size net/ipv6/proc.o text data bss dec hex filename 5516 592 0 6108 17dc net/ipv6/proc.o Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/proc.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 752327b10dde..e96f14a36834 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -99,26 +99,6 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_SENTINEL }; -/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ -static const char *const icmp6type2name[256] = { - [ICMPV6_DEST_UNREACH] = "DestUnreachs", - [ICMPV6_PKT_TOOBIG] = "PktTooBigs", - [ICMPV6_TIME_EXCEED] = "TimeExcds", - [ICMPV6_PARAMPROB] = "ParmProblems", - [ICMPV6_ECHO_REQUEST] = "Echos", - [ICMPV6_ECHO_REPLY] = "EchoReplies", - [ICMPV6_MGM_QUERY] = "GroupMembQueries", - [ICMPV6_MGM_REPORT] = "GroupMembResponses", - [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", - [ICMPV6_MLD2_REPORT] = "MLDv2Reports", - [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", - [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", - [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", - [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", - [NDISC_REDIRECT] = "Redirects", -}; - - static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), @@ -151,11 +131,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { + const char *p = NULL; int icmptype; - const char *p; + +#define CASE(TYP, STR) case TYP: p = STR; break; icmptype = i & 0xff; - p = icmp6type2name[icmptype]; + switch (icmptype) { +/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ + CASE(ICMPV6_DEST_UNREACH, "DestUnreachs") + CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs") + CASE(ICMPV6_TIME_EXCEED, "TimeExcds") + CASE(ICMPV6_PARAMPROB, "ParmProblems") + CASE(ICMPV6_ECHO_REQUEST, "Echos") + CASE(ICMPV6_ECHO_REPLY, "EchoReplies") + CASE(ICMPV6_MGM_QUERY, "GroupMembQueries") + CASE(ICMPV6_MGM_REPORT, "GroupMembResponses") + CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions") + CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports") + CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements") + CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits") + CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements") + CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits") + CASE(NDISC_REDIRECT, "Redirects") + } +#undef CASE if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", -- cgit v1.2.3 From ceac1fb2290d230eb83aff3761058c559440de13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:06 +0000 Subject: ipv6: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. Following patch needs this preliminary change. 
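The pattern repeated across this series: SNMP tables used to end in SNMP_MIB_SENTINEL (an all-zero entry) and every walker looped until a NULL name, so table lengths were run-time properties and scratch buffers had to be sized for a worst case such as SNMP_MIB_MAX. Dropping the sentinel and passing ARRAY_SIZE() makes the count a compile-time constant. A self-contained userspace sketch of the resulting shape (illustrative names, not the kernel definitions):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct snmp_mib {
            const char *name;
            int entry;
    };

    /* No sentinel entry; the element count is known to the compiler. */
    static const struct snmp_mib list[] = {
            { "InDatagrams", 1 },
            { "NoPorts",     2 },
    };

    int main(void)
    {
            unsigned long buff[ARRAY_SIZE(list)] = { 0 };   /* exact size */
            size_t i;

            for (i = 0; i < ARRAY_SIZE(list); i++)
                    printf("%-32s\t%lu\n", list[i].name, buff[i]);
            return 0;
    }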
Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 24 ++++++++++++++++++++++++ net/ipv6/proc.c | 43 ++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 6dbd2bf8fa9c..a1624e8db1ab 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -338,6 +338,19 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } +#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ + mib_statistic, offset) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; i < cnt; i++) \ + buff64[i] += snmp_get_cpu_field64( \ + mib_statistic, \ + c, stats_list[i].entry, \ + offset); \ + } \ +} + #define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ { \ int i, c; \ @@ -349,6 +362,17 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } +#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ +{ \ + int i, c; \ + for_each_possible_cpu(c) { \ + for (i = 0; i < cnt; i++) \ + buff[i] += snmp_get_cpu_field( \ + mib_statistic, \ + c, stats_list[i].entry); \ + } \ +} + static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) { u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index e96f14a36834..92ed04729c2f 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { @@ -96,7 +95,6 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udp6_list[] = { @@ -109,7 +107,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { @@ -121,7 +118,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) @@ -182,35 +178,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, - const struct snmp_mib *itemlist) + const struct snmp_mib *itemlist, + int cnt) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + memset(buff, 0, sizeof(unsigned long) * cnt); - snmp_get_cpu_field_batch(buff, itemlist, pcpumib); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", 
itemlist[i].name, buff[i]); } else { - for (i = 0; itemlist[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, - const struct snmp_mib *itemlist, size_t syncpoff) + const struct snmp_mib *itemlist, + int cnt, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); + memset(buff64, 0, sizeof(u64) * cnt); - snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } @@ -219,14 +217,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, - NULL, snmp6_icmp6_list); + NULL, snmp6_icmp6_list, + ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, - NULL, snmp6_udp6_list); + NULL, snmp6_udp6_list, + ARRAY_SIZE(snmp6_udp6_list)); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list); + NULL, snmp6_udplite6_list, + ARRAY_SIZE(snmp6_udplite6_list)); return 0; } @@ -236,9 +239,11 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } -- cgit v1.2.3 From 2fab94bcf313480336b0a41eb45a24ffd5087490 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:07 +0000 Subject: ipv6: snmp: do not track per idev ICMP6_MIB_RATELIMITHOST The blamed commit added critical false sharing on a single atomic_long_t under DoS, like receiving UDP packets to closed ports. Per-netns ICMP6_MIB_RATELIMITHOST tracking uses per-cpu storage and is enough; we do not need the slower per-device tracking.
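The false sharing is worth spelling out: the per-idev counter was one atomic_long_t incremented from every CPU, so under a flood each increment bounces the counter's cache line between cores, while the per-netns MIB is per-cpu, letting each core write its own line and paying the summation cost only on read. A rough userspace approximation of the two shapes (padded array slots stand in for kernel per-cpu data):

    #include <stdatomic.h>

    #define NR_CPUS 64

    /* Shared: every increment contends on a single cache line. */
    static atomic_long shared_ratelimit_host;

    static void inc_shared(void)
    {
            atomic_fetch_add_explicit(&shared_ratelimit_host, 1,
                                      memory_order_relaxed);
    }

    /* Per-CPU: each CPU touches only its own padded slot; readers
     * sum all slots, trading read cost for uncontended writes.
     */
    struct pcpu_slot {
            unsigned long val;
            char pad[64 - sizeof(unsigned long)];   /* one line per slot */
    };

    static struct pcpu_slot pcpu_ratelimit_host[NR_CPUS];

    static void inc_pcpu(int cpu)
    {
            pcpu_ratelimit_host[cpu].val++;
    }

    static unsigned long read_pcpu(void)
    {
            unsigned long sum = 0;
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    sum += pcpu_ratelimit_host[cpu].val;
            return sum;
    }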
Fixes: d0941130c9351 ("icmp: Add counters for rate limits") Signed-off-by: Eric Dumazet Cc: Jamie Bainbridge Cc: Abhishek Rawal Link: https://patch.msgid.link/20250905165813.1470708-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/icmp.c | 3 +-- net/ipv6/proc.c | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 95cdd4cacb00..56c974cf75d1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -230,8 +230,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } rcu_read_unlock(); if (!res) - __ICMP6_INC_STATS(net, ip6_dst_idev(dst), - ICMP6_MIB_RATELIMITHOST); + __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST); else icmp_global_consume(net); dst_release(dst); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 92ed04729c2f..73296f38c252 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -94,6 +94,7 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), +/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */ SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), }; @@ -242,8 +243,11 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) snmp6_ipstats_list, ARRAY_SIZE(snmp6_ipstats_list), offsetof(struct ipstats_mib, syncp)); + + /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */ snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list)); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } -- cgit v1.2.3 From b7b74953f8343ceca86df694322440104110c146 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:08 +0000 Subject: ipv4: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. 
Signed-off-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/proc.c | 65 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 65b0d0ab0084..974afc4ecbe2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -95,7 +95,6 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; /* Following items are displayed in /proc/net/netstat */ @@ -119,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), - SNMP_MIB_SENTINEL }; static const struct { @@ -157,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_udp_list[] = { @@ -170,7 +167,6 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_net_list[] = { @@ -309,7 +305,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS), - SNMP_MIB_SENTINEL }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, @@ -389,14 +384,15 @@ static void icmp_put(struct seq_file *seq) */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { + const int cnt = ARRAY_SIZE(snmp4_ipstats_list); + u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)]; struct net *net = seq->private; - u64 buff64[IPSTATS_MIB_MAX]; int i; - memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); + memset(buff64, 0, sizeof(buff64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); - for (i = 0; snmp4_ipstats_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", @@ -404,10 +400,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); - snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, - net->mib.ip_statistics, - offsetof(struct ipstats_mib, syncp)); - for (i = 0; snmp4_ipstats_list[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); + for (i = 0; i < cnt; i++) seq_printf(seq, " %llu", buff64[i]); return 0; @@ -415,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { + const int udp_cnt = ARRAY_SIZE(snmp4_udp_list); + const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list); unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, tcp_cnt * sizeof(unsigned long)); 
seq_puts(seq, "\nTcp:"); - for (i = 0; snmp4_tcp_list[i].name; i++) + for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); - snmp_get_cpu_field_batch(buff, snmp4_tcp_list, - net->mib.tcp_statistics); - for (i = 0; snmp4_tcp_list[i].name; i++) { + snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list, + tcp_cnt, + net->mib.tcp_statistics); + for (i = 0; i < tcp_cnt; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); @@ -436,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) seq_printf(seq, " %lu", buff[i]); } - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, udp_cnt * sizeof(unsigned long)); - snmp_get_cpu_field_batch(buff, snmp4_udp_list, - net->mib.udp_statistics); + snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, + udp_cnt, + net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, udp_cnt * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); - snmp_get_cpu_field_batch(buff, snmp4_udp_list, - net->mib.udplite_statistics); - for (i = 0; snmp4_udp_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, + udp_cnt, + net->mib.udplite_statistics); + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); @@ -480,8 +481,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) */ static int netstat_seq_show(struct seq_file *seq, void *v) { - const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; - const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; + const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list); + const int tcp_cnt = ARRAY_SIZE(snmp4_net_list); struct net *net = seq->private; unsigned long *buff; int i; @@ -494,8 +495,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v) buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { - snmp_get_cpu_field_batch(buff, snmp4_net_list, - net->mib.net_statistics); + snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt, + net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { @@ -513,7 +514,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); - snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, + snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) -- cgit v1.2.3 From 35cb2da0abafe148fbb592063259978cdba29c6c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:09 +0000 Subject: mptcp: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. 
Signed-off-by: Eric Dumazet Cc: Mat Martineau Cc: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250905165813.1470708-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index cf879c188ca2..6003e47c770a 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -85,7 +85,6 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), - SNMP_MIB_SENTINEL }; /* mptcp_mib_alloc - allocate percpu mib counters @@ -108,22 +107,23 @@ bool mptcp_mib_alloc(struct net *net) void mptcp_seq_show(struct seq_file *seq) { - unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; + unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)]; + const int cnt = ARRAY_SIZE(mptcp_snmp_list); struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); - for (i = 0; mptcp_snmp_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) - snmp_get_cpu_field_batch(sum, mptcp_snmp_list, - net->mib.mptcp_statistics); + snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt, + net->mib.mptcp_statistics); - for (i = 0; mptcp_snmp_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); -- cgit v1.2.3 From 52a33cae6a6faf17345d1b570ecf1f669f7d9648 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:10 +0000 Subject: sctp: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Reviewed-by: Sabrina Dubroca Acked-by: Xin Long Link: https://patch.msgid.link/20250905165813.1470708-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/proc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 74bff317e205..1ed281f3c355 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -52,21 +52,21 @@ static const struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), - SNMP_MIB_SENTINEL }; /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { - unsigned long buff[SCTP_MIB_MAX]; + unsigned long buff[ARRAY_SIZE(sctp_snmp_list)]; + const int cnt = ARRAY_SIZE(sctp_snmp_list); struct net *net = seq->private; int i; - memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); + memset(buff, 0, sizeof(buff)); - snmp_get_cpu_field_batch(buff, sctp_snmp_list, - net->sctp.sctp_statistics); - for (i = 0; sctp_snmp_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, sctp_snmp_list, cnt, + net->sctp.sctp_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); -- cgit v1.2.3 From 3a951f95202cfc189f5dd285bbc177dfbe19389d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:11 +0000 Subject: tls: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. 
Signed-off-by: Eric Dumazet Cc: John Fastabend Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/tls/tls_proc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 367666aa07b8..4012c4372d4c 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -27,17 +27,19 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), - SNMP_MIB_SENTINEL }; static int tls_statistics_seq_show(struct seq_file *seq, void *v) { - unsigned long buf[LINUX_MIB_TLSMAX] = {}; + unsigned long buf[ARRAY_SIZE(tls_mib_list)]; + const int cnt = ARRAY_SIZE(tls_mib_list); struct net *net = seq->private; int i; - snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); - for (i = 0; tls_mib_list[i].name; i++) + memset(buf, 0, sizeof(buf)); + snmp_get_cpu_field_batch_cnt(buf, tls_mib_list, cnt, + net->mib.tls_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); return 0; -- cgit v1.2.3 From c73d583e7008336b1fa53275fa3f65aaaba00e6e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Sep 2025 16:58:12 +0000 Subject: xfrm: snmp: do not use SNMP_MIB_SENTINEL anymore Use ARRAY_SIZE(), so that we know the limit at compile time. Signed-off-by: Eric Dumazet Cc: Steffen Klassert Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250905165813.1470708-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/xfrm/xfrm_proc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 8e07dd614b0b..5e1fd6b1d503 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -45,21 +45,21 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), - SNMP_MIB_SENTINEL }; static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) { - unsigned long buff[LINUX_MIB_XFRMMAX]; + unsigned long buff[ARRAY_SIZE(xfrm_mib_list)]; + const int cnt = ARRAY_SIZE(xfrm_mib_list); struct net *net = seq->private; int i; - memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX); + memset(buff, 0, sizeof(buff)); xfrm_state_update_stats(net); - snmp_get_cpu_field_batch(buff, xfrm_mib_list, - net->mib.xfrm_statistics); - for (i = 0; xfrm_mib_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, xfrm_mib_list, cnt, + net->mib.xfrm_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, buff[i]); -- cgit v1.2.3 From 9f0730b063b436938ebb6371aecb12ec6ed896e9 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Fri, 5 Sep 2025 12:19:57 +0200 Subject: rds: ib: Remove unused extern definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the old days, RDS used FMR (Fast Memory Registration) to register IB MRs to be used by RDMA. A newer and better verbs based registration/de-registration method called FRWR (Fast Registration Work Request) was added to RDS by commit 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") in 2016. 
Detection and enablement of FRWR was done in commit 2cb2912d6563 ("RDS: IB: add Fastreg MR (FRMR) detection support"). But said commit added an extern bool prefer_frmr, which was not used by said commit - nor used by later commits. Hence, remove it. Signed-off-by: Håkon Bugge Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250905101958.4028647-1-haakon.bugge@oracle.com Signed-off-by: Jakub Kicinski --- net/rds/ib_mr.h | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index ea5e9aee4959..5884de8c6f45 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -108,7 +108,6 @@ struct rds_ib_mr_pool { }; extern struct workqueue_struct *rds_ib_mr_wq; -extern bool prefer_frmr; struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); -- cgit v1.2.3 From 8625f5748fea960d2af4f3c3e9891ee8f6f80906 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 5 Sep 2025 13:12:33 +0200 Subject: net: bridge: Bounce invalid boolopts The bridge driver currently tolerates options that it does not recognize. Instead, it should bounce them. Fixes: a428afe82f98 ("net: bridge: add support for user-controlled bool options") Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/e6fdca3b5a8d54183fbda075daffef38bdd7ddce.1757070067.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f0..c683baa3847f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br, int err = 0; int opt_id; + opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); + if (opt_id != BITS_PER_LONG) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", + opt_id); + return -EINVAL; + } + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); -- cgit v1.2.3 From d67ca09ca39f9605459959004b28c56899e3bca3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:55:58 +0000 Subject: hsr: use netdev_master_upper_dev_link() when linking lower ports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike VLAN devices, HSR changes the lower device’s rx_handler, which prevents the lower device from being attached to another master. Switch to using netdev_master_upper_dev_link() when setting up the lower device. This could improve user experience, since ip link will now display the HSR device as the master for its ports.
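For context on why an HSR port is master-exclusive at all: HSR registers its own rx_handler on each slave, and the core allows exactly one rx_handler per device, so a second would-be master already fails at claim time; the master link just makes that ownership visible in the upper/lower device graph (and hence in ip link). A hedged sketch of the claim step, with error unwinding elided (claim_port is an illustrative name; hsr_handle_frame is the real HSR handler):

    /* The core rejects a second rx_handler with -EBUSY, which is what
     * makes an HSR slave unusable by any other master device.
     */
    static int claim_port(struct net_device *dev, struct hsr_port *port)
    {
            return netdev_rx_handler_register(dev, hsr_handle_frame, port);
    }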
Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902065558.360927-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_slave.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 102eccf5ead7..8177ac6c2d26 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -143,6 +143,7 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct netlink_ext_ack *extack) { + struct netdev_lag_upper_info lag_upper_info; struct net_device *hsr_dev; struct hsr_port *master; int res; @@ -159,7 +160,9 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; - res = netdev_upper_dev_link(dev, hsr_dev, extack); + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST; + lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; + res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack); if (res) goto fail_upper_dev_link; -- cgit v1.2.3 From e3c674db356c4303804b2415e7c2b11776cdd8c3 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 4 Sep 2025 14:53:50 +0200 Subject: tunnels: reset the GSO metadata before reusing the skb If a GSO skb is sent through a Geneve tunnel and if Geneve options are added, the split GSO skb might not fit in the MTU anymore and an ICMP frag needed packet can be generated. In such a case the ICMP packet might go through the segmentation logic (and be dropped) later if it reaches a path where the GSO status is checked and segmentation is required. This is especially true when an OvS bridge is used with a Geneve tunnel attached to it. The following set of actions could lead to the ICMP packet being wrongfully segmented:
1. An skb is constructed by the TCP layer (e.g. gso_type SKB_GSO_TCPV4, segs >= 2).
2. The skb hits the OvS bridge where Geneve options are added by an OvS action before being sent through the tunnel.
3. When the skb is xmited in the tunnel, the split skb does not fit anymore in the MTU and iptunnel_pmtud_build_icmp is called to generate an ICMP fragmentation needed packet. This is done by reusing the original (GSO!) skb. The GSO metadata is not cleared.
4. The ICMP packet being sent back hits the OvS bridge again and because skb_is_gso returns true, it goes through queue_gso_packets...
5. ...where __skb_gso_segment is called. The skb is then dropped.
6. Note that in the above example on re-transmission the skb won't be a GSO one as it would be segmented (len > MSS) and the ICMP packet should go through.
Fix this by resetting the GSO information before reusing an skb in iptunnel_pmtud_build_icmp and iptunnel_pmtud_build_icmpv6.
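The underlying detail is that skb_is_gso() only looks at shared-info metadata (gso_size != 0), not at the payload, so an ICMP error built by rewriting a GSO skb in place keeps looking like a GSO packet to every later hop. A toy model of the bug and the fix (simplified fields, not the real struct sk_buff):

    /* GSO state lives in metadata, not in the payload, so it survives
     * the in-place rewrite unless explicitly cleared.
     */
    struct skb_model {
            unsigned int len;
            unsigned short gso_size;        /* 0 means "not GSO" */
    };

    static int is_gso(const struct skb_model *skb)
    {
            return skb->gso_size != 0;      /* what skb_is_gso() checks */
    }

    static void build_icmp_from(struct skb_model *skb, unsigned int icmp_len)
    {
            skb->len = icmp_len;            /* payload is rewritten... */
            skb->gso_size = 0;              /* ...and the stale GSO state is
                                             * wiped, as skb_gso_reset() does
                                             * in the fix */
    }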
Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Reported-by: Adrian Moreno Signed-off-by: Antoine Tenart Reviewed-by: Stefano Brivio Link: https://patch.msgid.link/20250904125351.159740-1-atenart@kernel.org Signed-off-by: Paolo Abeni --- net/ipv4/ip_tunnel_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637..2e61ac137128 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); -- cgit v1.2.3 From 30f241fcf52aaaef7ac16e66530faa11be78a865 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 4 Sep 2025 21:49:07 +0200 Subject: xsk: Fix immature cq descriptor production Eryk reported an issue that I have put under Closes: tag, related to umem addrs being prematurely produced onto the pool's completion queue. Let us make the skb's destructor responsible for producing all addrs that a given skb used. The commit from the Fixes tag introduced the buggy behavior; it was not broken from day 1, but rather when xsk multi-buffer support got introduced. In order to mitigate performance impact as much as possible, mimic the linear and frag parts within the skb by storing the first address from the XSK descriptor at sk_buff::destructor_arg. For fragments, store them at ::cb via a list. The nodes that will go onto the list will be allocated via kmem_cache. xsk_destruct_skb() will consume the address stored at ::destructor_arg and optionally go through the list from ::cb, if the count of descriptors associated with this particular skb is bigger than 1. The previous approach, where a whole array for storing UMEM addresses from XSK descriptors was pre-allocated during first fragment processing, yielded too big a performance regression for 64b traffic. In the current approach the impact is much reduced in my tests, and for jumbo frames I observed traffic being slower by at most 9%. Magnus suggested having this way of processing special-cased for XDP_SHARED_UMEM, so we would identify this during bind and set different hooks for 'backpressure mechanism' on CQ and for skb destructor, but given that results looked promising on my side I decided to have a single data path for XSK generic Tx. I suppose other auxiliary stuff would have to land as well in order to make it work.
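A minimal user-space model of this bookkeeping (illustrative only; all names here are made up, this is not the kernel code): the first descriptor address rides in a fixed per-skb slot, each additional fragment address costs one small allocation on a per-skb list, and nothing is produced to the completion queue until the destructor runs.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct addr_node {
	uint64_t addr;
	struct addr_node *next;
};

struct fake_skb {
	uint64_t first_addr;     /* stands in for destructor_arg */
	unsigned int num_descs;  /* stands in for XSKCB(skb)->num_descs */
	struct addr_node *frags; /* stands in for the list kept in ->cb */
};

/* Record one descriptor address: the first one is stored inline,
 * further ones go onto the list (ordering ignored in this toy). */
static int add_desc(struct fake_skb *skb, uint64_t addr)
{
	if (skb->num_descs == 0) {
		skb->first_addr = addr;
	} else {
		struct addr_node *n = malloc(sizeof(*n));

		if (!n)
			return -1;
		n->addr = addr;
		n->next = skb->frags;
		skb->frags = n;
	}
	skb->num_descs++;
	return 0;
}

/* "Destructor": only now do the addresses reach the completion queue. */
static void destruct(struct fake_skb *skb)
{
	printf("cq: %#llx\n", (unsigned long long)skb->first_addr);
	while (skb->frags) {
		struct addr_node *n = skb->frags;

		skb->frags = n->next;
		printf("cq: %#llx\n", (unsigned long long)n->addr);
		free(n);
	}
}

int main(void)
{
	struct fake_skb skb = { 0 };

	add_desc(&skb, 0x1000); /* linear part */
	add_desc(&skb, 0x2000); /* fragment */
	destruct(&skb);         /* completions happen here, not earlier */
	return 0;
}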
Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path") Reported-by: Eryk Kubanski Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@partner.samsung.com/ Acked-by: Stanislav Fomichev Signed-off-by: Maciej Fijalkowski Tested-by: Jason Xing Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20250904194907.2342177-1-maciej.fijalkowski@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 113 +++++++++++++++++++++++++++++++++++++++++++++------- net/xdp/xsk_queue.h | 12 ++++++ 2 files changed, 111 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c3acecc14b1..72e34bd2d925 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,20 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 +struct xsk_addr_node { + u64 addr; + struct list_head addr_node; +}; + +struct xsk_addr_head { + u32 num_descs; + struct list_head addrs_list; +}; + +static struct kmem_cache *xsk_tx_generic_cache; + +#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) + void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -532,24 +546,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(pool->cq, addr); + ret = xskq_prod_reserve(pool->cq); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, + struct sk_buff *skb) { + struct xsk_addr_node *pos, *tmp; + u32 descs_processed = 0; unsigned long flags; + u32 idx; spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_submit_n(pool->cq, n); + idx = xskq_get_prod(pool->cq); + + xskq_prod_write_addr(pool->cq, idx, + (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); + descs_processed++; + + if (unlikely(XSKCB(skb)->num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + xskq_prod_write_addr(pool->cq, idx + descs_processed, + pos->addr); + descs_processed++; + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } + xskq_prod_submit_n(pool->cq, descs_processed); spin_unlock_irqrestore(&pool->cq_lock, flags); } @@ -562,9 +595,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) spin_unlock_irqrestore(&pool->cq_lock, flags); } +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + XSKCB(skb)->num_descs++; +} + static u32 xsk_get_num_desc(struct sk_buff *skb) { - return skb ? 
(long)skb_shinfo(skb)->destructor_arg : 0; + return XSKCB(skb)->num_descs; } static void xsk_destruct_skb(struct sk_buff *skb) @@ -576,23 +614,33 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); + xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr) { - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; - - skb_shinfo(skb)->destructor_arg = (void *)num; + BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); + INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); + XSKCB(skb)->num_descs = 0; + skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addr_node *pos, *tmp; + + if (unlikely(num_descs > 1)) { + list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { + list_del(&pos->addr_node); + kmem_cache_free(xsk_tx_generic_cache, pos); + } + } skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -609,6 +657,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; + struct xsk_addr_node *xsk_addr; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; @@ -623,6 +672,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); + + xsk_set_destructor_arg(skb, desc->addr); + } else { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + /* in case of -EOVERFLOW that could happen below, + * xsk_consume_skb() will release this node as whole skb + * would be dropped, which implies freeing all list elements + */ + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } addr = desc->addr; @@ -694,8 +756,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; + + xsk_set_destructor_arg(skb, desc->addr); } else { int nr_frags = skb_shinfo(skb)->nr_frags; + struct xsk_addr_node *xsk_addr; struct page *page; u8 *vaddr; @@ -710,12 +775,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (!xsk_addr) { + __free_page(page); + err = -ENOMEM; + goto free_err; + } + vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); + + xsk_addr->addr = desc->addr; + list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } if (first_frag && desc->options & XDP_TX_METADATA) { @@ -759,7 +834,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - xsk_set_destructor_arg(skb); + xsk_inc_num_desc(skb); return skb; @@ -769,7 +844,7 @@ free_err: if (err == -EOVERFLOW) { /* Drop the packet */ - xsk_set_destructor_arg(xs->skb); + 
xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { @@ -812,7 +887,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; @@ -1815,8 +1890,18 @@ static int __init xsk_init(void) if (err) goto out_pernet; + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", + sizeof(struct xsk_addr_node), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!xsk_tx_generic_cache) { + err = -ENOMEM; + goto out_unreg_notif; + } + return 0; +out_unreg_notif: + unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6..f16f390370dc 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer); +} + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[idx & q->ring_mask] = addr; +} + static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { -- cgit v1.2.3 From d436b5abba4f80e968b3ff83be4363c7aedcc799 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Sun, 7 Sep 2025 12:25:32 -0700 Subject: ipv4: udp: fix typos in comments Correct typos in ipv4/udp.c comments for clarity: "Encapulation" -> "Encapsulation" "measureable" -> "measurable" "tacking care" -> "taking care" No functional changes. Signed-off-by: Alok Tiwari Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250907192535.3610686-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 732bdad43626..cca41c569f37 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -68,7 +68,7 @@ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. - * Derek Atkins : Add Encapulation Support + * Derek Atkins : Add Encapsulation Support * James Chapman : Add L2TP encapsulation type. */ @@ -509,7 +509,7 @@ rescore: /* compute_score is too long of a function to be * inlined, and calling it again here yields - * measureable overhead for some + * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. 
*/ @@ -2609,7 +2609,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return 0; } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From 686cab5a18e443e1d5f2abb17bed45837836425f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 7 Sep 2025 11:08:21 +0300 Subject: net: dev_ioctl: take ops lock in hwtstamp lower paths ndo hwtstamp callbacks are expected to run under the per-device ops lock. Make the lower get/set paths consistent with the rest of ndo invocations. Kernel log: WARNING: CPU: 13 PID: 51364 at ./include/net/netdev_lock.h:70 __netdev_update_features+0x4bd/0xe60 ... RIP: 0010:__netdev_update_features+0x4bd/0xe60 ... Call Trace: netdev_update_features+0x1f/0x60 mlx5_hwtstamp_set+0x181/0x290 [mlx5_core] mlx5e_hwtstamp_set+0x19/0x30 [mlx5_core] dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp_phylib+0x9f/0x220 dev_set_hwtstamp+0x13d/0x240 dev_ioctl+0x12f/0x4b0 sock_ioctl+0x171/0x370 __x64_sys_ioctl+0x3f7/0x900 ? __sys_setsockopt+0x69/0xb0 do_syscall_64+0x6f/0x2e0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ... .... ---[ end trace 0000000000000000 ]--- Note that the mlx5_hwtstamp_set and mlx5e_hwtstamp_set functions shown in the trace come from an in-progress patch converting the legacy ioctl to ndo_hwtstamp_get/set and are not present in mainline. Fixes: ffb7ed19ac0a ("net: hold netdev instance lock during ioctl operations") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Link: https://patch.msgid.link/20250907080821.2353388-1-cjubran@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/dev_ioctl.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d8..ad54b12d4b4c 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) - return dev_get_hwtstamp_phylib(dev, kernel_cfg); + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) - return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); -- cgit v1.2.3 From 648de37416b301f046f62f1b65715c7fa8ebaa67 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Mon, 8 Sep 2025 11:16:01 -0700 Subject: mptcp: sockopt: make sync_socket_options propagate SOCK_KEEPOPEN Users reported a scenario where MPTCP connections that were configured with SO_KEEPALIVE prior to connect would fail to enable their keepalives if MPTCP fell back to TCP mode.
After investigating, this affects keepalives for any connection where sync_socket_options is called on a socket that is in the closed or listening state. Joins are handled properly. For connects, sync_socket_options is called when the socket is still in the closed state. The tcp_set_keepalive() function does not act on sockets that are closed or listening, hence keepalive is not immediately enabled. Since the SO_KEEPOPEN flag is absent, it is not enabled later in the connect sequence via tcp_finish_connect. Setting the keepalive via sockopt after connect does work, but would not address any subsequently created flows. Fortunately, the fix here is straightforward: set SOCK_KEEPOPEN on the subflow when calling sync_socket_options. The fix was validated both by using tcpdump to observe keepalive packets not being sent before the fix, and being sent after the fix. It was also possible to observe via ss that the keepalive timer was not enabled on these sockets before the fix, but was enabled afterwards. Fixes: 1b3e7ede1365 ("mptcp: setsockopt: handle SO_KEEPALIVE and SO_PRIORITY") Cc: stable@vger.kernel.org Signed-off-by: Krister Johansen Reviewed-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/aL8dYfPZrwedCIh9@templeofstupid.com Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95be..2abe6f1e9940 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool keep_open; - if (ssk->sk_prot->keepalive) { - if (sock_flag(sk, SOCK_KEEPOPEN)) - ssk->sk_prot->keepalive(ssk, 1); - else - ssk->sk_prot->keepalive(ssk, 0); - } + keep_open = sock_flag(sk, SOCK_KEEPOPEN); + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, keep_open); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; -- cgit v1.2.3 From 30549eebc4d84f844c62965f2e1d362bc1accdce Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 7 Sep 2025 17:32:42 +0200 Subject: mptcp: make ADD_ADDR retransmission timeout adaptive Currently the ADD_ADDR option is retransmitted with a fixed timeout. This patch makes the retransmission timeout adaptive by using the maximum RTO among all the subflows, while still capping it at the configured maximum value (add_addr_timeout_max). This improves responsiveness when establishing new subflows. Specifically: 1. Adds mptcp_adjust_add_addr_timeout() helper to compute the adaptive timeout. 2. Uses maximum subflow RTO (icsk_rto) when available. 3. Applies exponential backoff based on retransmission count. 4. Maintains fallback to configured max timeout when no RTO data exists. This slightly changes the behaviour of the MPTCP "add_addr_timeout" sysctl knob to be used as a maximum instead of a fixed value. But this is seen as an improvement: the ADD_ADDR might be sent quicker than before to improve the overall MPTCP connection. Also, the default value is set to 2 min, which was already way too long, and caused the ADD_ADDR not to be retransmitted for connections shorter than 2 minutes.
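The computation can be modeled in a few lines of plain C (an illustrative sketch mirroring the mptcp_adjust_add_addr_timeout() helper and the sk_reset_timer() backoff in the diff that follows; units and values are made up, the kernel works in jiffies):

#include <stdio.h>

static unsigned int add_addr_timeout(const unsigned int *rtos, int n,
				     unsigned int sysctl_max,
				     unsigned int retrans_times)
{
	unsigned int max = 0, timeout = sysctl_max;
	int i;

	/* maximum RTO among all subflows */
	for (i = 0; i < n; i++)
		if (rtos[i] > max)
			max = rtos[i];

	/* cap at the sysctl value; fall back to it if no RTO data */
	if (max && max < timeout)
		timeout = max;

	/* exponential backoff per retransmission */
	return timeout << retrans_times;
}

int main(void)
{
	unsigned int rtos[] = { 200, 350 }; /* hypothetical per-subflow RTOs, ms */
	unsigned int r;

	for (r = 0; r < 3; r++)
		printf("retrans %u: %u ms\n", r,
		       add_addr_timeout(rtos, 2, 120000, r));
	return 0;
}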
Suggested-by: Matthieu Baerts (NGI0) Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/576 Reviewed-by: Christoph Paasch Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250907-net-next-mptcp-add_addr-retrans-adapt-v1-1-824cc805772b@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/mptcp-sysctl.rst | 8 +++++--- net/mptcp/pm.c | 28 ++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 1683c139821e..1eb6af26b4a7 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -8,9 +8,11 @@ MPTCP Sysfs variables =============================== add_addr_timeout - INTEGER (seconds) - Set the timeout after which an ADD_ADDR control message will be - resent to an MPTCP peer that has not acknowledged a previous - ADD_ADDR message. + Set the maximum value of the timeout after which an ADD_ADDR control message + will be resent to an MPTCP peer that has not acknowledged a previous + ADD_ADDR message. A dynamically estimated retransmission timeout based + on the estimated connection round-trip-time is used if this value is + lower than the maximum one. Do not retransmit if set to 0. diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 136a380602ca..204e1f61212e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -268,6 +268,27 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } +static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) +{ + const struct net *net = sock_net((struct sock *)msk); + unsigned int rto = mptcp_get_add_addr_timeout(net); + struct mptcp_subflow_context *subflow; + unsigned int max = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct inet_connection_sock *icsk = inet_csk(ssk); + + if (icsk->icsk_rto > max) + max = icsk->icsk_rto; + } + + if (max && max < rto) + rto = max; + + return rto; +} + static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer, @@ -292,7 +313,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } - timeout = mptcp_get_add_addr_timeout(sock_net(sk)); + timeout = mptcp_adjust_add_addr_timeout(msk); if (!timeout) goto out; @@ -307,7 +328,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + timeout); + jiffies + (timeout << entry->retrans_times)); spin_unlock_bh(&msk->pm.lock); @@ -348,7 +369,6 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); unsigned int timeout; lockdep_assert_held(&msk->pm.lock); @@ -374,7 +394,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: - timeout = mptcp_get_add_addr_timeout(net); + timeout = mptcp_adjust_add_addr_timeout(msk); if (timeout) sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); -- cgit v1.2.3 From ce0b015e2619ae64b7d33fb24a6b6cadcd70c317 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Sat, 6 Sep 2025 18:29:43 -0700 Subject: devlink: Add 'total_vfs' generic device param NICs are typically configured with total_vfs=0, forcing
users to rely on external tools to enable SR-IOV (a widely used and essential feature). Add a total_vfs parameter to devlink for SR-IOV max VF configurability. This enables standard kernel tools to manage SR-IOV, addressing the need for flexible VF configuration. Signed-off-by: Vlad Dumitrescu Tested-by: Kamal Heib Reviewed-by: Jiri Pirko Signed-off-by: Saeed Mahameed Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250907012953.301746-2-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 5 +++++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index 211b58177e12..c51da4fba7e7 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -143,3 +143,8 @@ own name. * - ``clock_id`` - u64 - Clock ID used by the device for registering DPLL devices and pins. + * - ``total_vfs`` - u32 - The max number of Virtual Functions (VFs) exposed by the PF. After reboot/PCI reset, the 'sriov_totalvfs' entry under the device's sysfs directory will report this value. diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f44e702c25c..8d4362f010e4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -530,6 +530,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -594,6 +595,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" #define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 41dcc86cfd94..33134940c266 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -102,6 +102,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, + .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 8cc71fc3b82b51e155fbe20876b1aa17a315ac4c Mon Sep 17 00:00:00 2001 From: Nithyanantham Paramasivam Date: Fri, 5 Sep 2025 18:18:00 +0530 Subject: wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO Currently, nl80211_get_station() allocates a fixed buffer size using NLMSG_DEFAULT_SIZE. In multi-link scenarios - particularly when the number of links exceeds two - this buffer size is often insufficient to accommodate complete station statistics, resulting in "no buffer space available" errors. To address this, modify nl80211_get_station() to return only accumulated station statistics and exclude per link stats. Pass a new flag (link_stats) to nl80211_send_station() to control the inclusion of per link statistics.
This allows retaining detailed output with per link data in dump commands, while excluding it from other commands where it is not needed. This change modifies the handling of per link stats introduced in commit 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") to enable them only for nl80211_dump_station(). Apply the same fix to cfg80211_del_sta_sinfo() by skipping per link stats to avoid buffer issues. cfg80211_new_sta() doesn't include stats and is therefore not impacted. Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Nithyanantham Paramasivam Link: https://patch.msgid.link/20250905124800.1448493-1-nithyanantham.paramasivam@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..f2f7424e930c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, struct net_device *dev, - const u8 *mac_addr, struct station_info *sinfo) + const u8 *mac_addr, struct station_info *sinfo, + bool link_stats) { void *hdr; struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } - if (sinfo->valid_links) { + if (link_stats && sinfo->valid_links) { links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); if (!links) goto nla_put_failure; @@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo) < 0) + &sinfo, true) < 0) goto out; sta_idx++; @@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo) < 0) { + rdev, dev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } -- cgit v1.2.3 From eebccbfea4184feb758c104783b870ec4ddb6aec Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Thu, 4 Sep 2025 16:10:54 +0530 Subject: wifi: mac80211: fix reporting of all valid links in sta_set_sinfo() Currently, sta_set_sinfo() fails to populate link-level station info when sinfo->valid_links is initially 0 and sta->sta.valid_links has bits set for links other than link 0. This typically occurs when association happens on a non-zero link or link 0 is deleted dynamically. In such cases, the for_each_valid_link(sinfo, link_id) loop only executes for link 0 and terminates early, since sinfo->valid_links remains 0. As a result, only MLD-level information is reported to userspace.
Hence, to fix this, initialize sinfo->valid_links with sta->sta.valid_links before entering the loop to ensure the loop executes for each valid link. During iteration, mask out invalid links from sinfo->valid_links if any of sta->link[link_id], sdata->link[link_id], or sinfo->links[link_id] are not present, to report only valid link information. Fixes: 505991fba9ec ("wifi: mac80211: extend support to fill link level sinfo structure") Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250904104054.790321-1-quic_sarishar@quicinc.com [clarify comment] Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8e275f0a1238..1bd75e0375a0 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -3207,16 +3207,20 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, int link_id; ether_addr_copy(sinfo->mld_addr, sta->addr); + + /* assign valid links first for iteration */ + sinfo->valid_links = sta->sta.valid_links; + for_each_valid_link(sinfo, link_id) { link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); link = wiphy_dereference(sdata->local->hw.wiphy, sdata->link[link_id]); - if (!link_sta || !sinfo->links[link_id] || !link) + if (!link_sta || !sinfo->links[link_id] || !link) { + sinfo->valid_links &= ~BIT(link_id); continue; - - sinfo->valid_links = sta->sta.valid_links; + } sta_set_link_sinfo(sta, sinfo->links[link_id], link, tidstats); } -- cgit v1.2.3 From 906a5a8c7152ec2f76280f7224bb13adab64118c Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Fri, 22 Aug 2025 10:51:10 +0530 Subject: wifi: mac80211: add tx_handlers_drop statistics to ethtool Currently, tx_handlers_drop statistics are handled only for the slow TX path and only at the radio level. This also requires CONFIG_MAC80211_DEBUG_COUNTERS to be enabled to account for the dropped packets. There is no way to check these stats for the fast TX path or at the interface level without enabling the debug configuration. Hence, add a new counter at the sdata level to track packets dropped with reason TX_DROP during transmission for the fast path, the slow path and other tx management packets. Expose this via ethtool statistics, to improve visibility into transmission failures at the interface level and aid debugging and performance monitoring. Place the counter in ethtool with other available tx_* stats for better readability and accurate tracking. Sample output: root@buildroot:~# ethtool -S wlan0 NIC statistics: rx_packets: 5904 rx_bytes: 508122 rx_duplicates: 12 rx_fragments: 5900 rx_dropped: 12 tx_packets: 391487 tx_bytes: 600423383 tx_filtered: 0 tx_retry_failed: 10332 tx_retries: 1548 tx_handlers_drop: 4 ....
Co-developed-by: Hari Chandrakanthan Signed-off-by: Hari Chandrakanthan Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250822052110.513804-1-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/ethtool.c | 6 ++++-- net/mac80211/ieee80211_i.h | 1 + net/mac80211/tx.c | 7 ++++++- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 0397755a3bd1..3d365626faa4 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -48,8 +48,8 @@ static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", - "sta_state", "txrate", "rxrate", "signal", - "channel", "noise", "ch_time", "ch_time_busy", + "tx_handlers_drop", "sta_state", "txrate", "rxrate", + "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) @@ -120,6 +120,7 @@ static void ieee80211_get_stats(struct net_device *dev, i = 0; ADD_STA_STATS(&sta->deflink); + data[i++] = sdata->tx_handlers_drop; data[i++] = sta->sta_state; @@ -145,6 +146,7 @@ static void ieee80211_get_stats(struct net_device *dev, sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); + data[i++] = sdata->tx_handlers_drop; } } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 07f5fb11569b..8a666faeb1ec 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1218,6 +1218,7 @@ struct ieee80211_sub_if_data { } debugfs; #endif + u32 tx_handlers_drop; /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 0ece8d89e094..a27e2af5d569 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1814,6 +1814,7 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { + tx->sdata->tx_handlers_drop++; I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); @@ -1858,6 +1859,7 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { + tx->sdata->tx_handlers_drop++; I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); @@ -1942,6 +1944,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); + tx.sdata->tx_handlers_drop++; return true; } else if (unlikely(res_prepare == TX_QUEUED)) { return true; @@ -3728,8 +3731,10 @@ void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; - if (r == TX_DROP) + if (r == TX_DROP) { + tx.sdata->tx_handlers_drop++; goto free; + } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, -- cgit v1.2.3 From ea928544f3215fdeac24d66bef85e10bb638b8c1 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Sun, 7 Sep 2025 11:51:17 +0300 Subject: wifi: mac80211: Fix HE capabilities element check The element data length check did not account for the extra octet used for the extension ID. Fix it. 
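To see why the extra octet matters, consider the element layout (a simplified sketch; the real struct ieee80211_he_cap_elem lives in include/linux/ieee80211.h and the element carries further variable-length fields after it):

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in for struct ieee80211_he_cap_elem. */
struct he_cap_elem {
	uint8_t mac_cap_info[6];
	uint8_t phy_cap_info[11];
};

int main(void)
{
	/* For an extension element the data is laid out as:
	 *   data[0]   one-byte extension ID (HE capabilities)
	 *   data[1..] the capabilities payload
	 * so the minimum acceptable datalen is sizeof(payload) + 1;
	 * comparing against sizeof(payload) alone is off by one.
	 */
	printf("min datalen: %zu\n", sizeof(struct he_cap_elem) + 1);
	return 0;
}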
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250907115109.8da0012e2286.I8c0c69a0011f7153c13b365b14dfef48cfe7c3e3@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 83a9986dd1c4..f73e3222981b 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5725,7 +5725,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); - if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap)) + if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap) + 1) return chains; /* skip one byte ext_tag_id */ -- cgit v1.2.3 From 185cc2352cb1ef2178fe4e9a220a73c94007b8bb Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sun, 7 Sep 2025 11:51:43 +0300 Subject: wifi: cfg80211: update the time stamps in hidden ssid In hidden SSID we have separate BSS entries for the beacon and for the probe response(s). The BSS entry time stamps represent the age of the BSS: when we last heard from it. When we receive a beacon of a hidden SSID, it means that we heard that BSS, so it makes sense to indicate that in the probe response entries. Do that. Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250907115135.712745e498c0.I38186abf5d20dec6f6f2d42d2e1cdb50c6bfea25@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a8339ed52404..52a3d32c16fe 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1816,6 +1816,9 @@ static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, WARN_ON(ies != old_ies); rcu_assign_pointer(bss->pub.beacon_ies, new_ies); + + bss->ts = known->ts; + bss->pub.ts_boottime = known->pub.ts_boottime; } } @@ -1882,6 +1885,10 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, { lockdep_assert_held(&rdev->bss_lock); + /* Update time stamps */ + known->ts = new->ts; + known->pub.ts_boottime = new->pub.ts_boottime; + /* Update IEs */ if (rcu_access_pointer(new->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; @@ -1944,8 +1951,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, if (signal_valid) known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; - known->ts = new->ts; - known->pub.ts_boottime = new->pub.ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, -- cgit v1.2.3 From 691009b7ef08f7c9adee19706c8865a7e4d87037 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 8 Sep 2025 12:27:45 +0300 Subject: wifi: mac80211: fix incorrect comment Contrary to what the comment says, we don't count the length of the Per STA Profile of the association link in the skb size of the association request frame. Fix the comment.
Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908122652.7022f33b1f33.Iac0d35744df883e8b96d71bbe8da518cc5d514bf@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f73e3222981b..43a53da42e52 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2183,11 +2183,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + /* ext capa & op */ 2; /* EML capa */ - /* - * The capability elements were already considered above; - * note this over-estimates a bit because there's no - * STA profile for the assoc link. - */ + /* The capability elements were already considered above */ size += (n_links - 1) * (1 + 1 + /* subelement ID/length */ 2 + /* STA control */ -- cgit v1.2.3 From b2422712d15db99c6e11a31a8df53e603604c494 Mon Sep 17 00:00:00 2001 From: Zheng tan Date: Wed, 10 Sep 2025 09:55:56 +0800 Subject: wifi: cfg80211: Remove the redundant wiphy_dev There is no need to call wiphy_dev() again. Simplifying the code makes it more readable. Signed-off-by: Zheng tan Reviewed-by: Arend van Spriel Link: https://patch.msgid.link/20250910015556.219298-1-tanzheng@kylinos.cn Signed-off-by: Johannes Berg --- net/wireless/ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index 2613d6ac0fda..46e4317cbd7e 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -23,7 +23,7 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) else strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strscpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), + strscpy(info->bus_info, dev_name(pdev), sizeof(info->bus_info)); } EXPORT_SYMBOL(cfg80211_get_drvinfo); -- cgit v1.2.3 From a3967baad4d533dc254c31e0d221e51c8d223d58 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 Sep 2025 23:26:12 +0000 Subject: tcp_bpf: Call sk_msg_free() when tcp_bpf_send_verdict() fails to allocate psock->cork. syzbot reported the splat below. [0] The repro does the following: 1. Load a sk_msg prog that calls bpf_msg_cork_bytes(msg, cork_bytes) 2. Attach the prog to a SOCKMAP 3. Add a socket to the SOCKMAP 4. Activate fault injection 5. Send data less than cork_bytes At 5., the data is carried over to the next sendmsg() as it is smaller than the cork_bytes specified by bpf_msg_cork_bytes(). Then, tcp_bpf_send_verdict() tries to allocate psock->cork to hold the data, but this fails silently due to fault injection + __GFP_NOWARN. If the allocation fails, we need to revert the sk->sk_forward_alloc change done by sk_msg_alloc(). Let's call sk_msg_free() when tcp_bpf_send_verdict fails to allocate psock->cork. The "*copied" also needs to be updated so that a proper error can be returned to the caller, sendmsg. Nothing has been corked so far when the allocation of psock->cork fails, so this patch simply sets "*copied" to 0.
[0]: WARNING: net/ipv4/af_inet.c:156 at inet_sock_destruct+0x623/0x730 net/ipv4/af_inet.c:156, CPU#1: syz-executor/5983 Modules linked in: CPU: 1 UID: 0 PID: 5983 Comm: syz-executor Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:inet_sock_destruct+0x623/0x730 net/ipv4/af_inet.c:156 Code: 0f 0b 90 e9 62 fe ff ff e8 7a db b5 f7 90 0f 0b 90 e9 95 fe ff ff e8 6c db b5 f7 90 0f 0b 90 e9 bb fe ff ff e8 5e db b5 f7 90 <0f> 0b 90 e9 e1 fe ff ff 89 f9 80 e1 07 80 c1 03 38 c1 0f 8c 9f fc RSP: 0018:ffffc90000a08b48 EFLAGS: 00010246 RAX: ffffffff8a09d0b2 RBX: dffffc0000000000 RCX: ffff888024a23c80 RDX: 0000000000000100 RSI: 0000000000000fff RDI: 0000000000000000 RBP: 0000000000000fff R08: ffff88807e07c627 R09: 1ffff1100fc0f8c4 R10: dffffc0000000000 R11: ffffed100fc0f8c5 R12: ffff88807e07c380 R13: dffffc0000000000 R14: ffff88807e07c60c R15: 1ffff1100fc0f872 FS: 00005555604c4500(0000) GS:ffff888125af1000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005555604df5c8 CR3: 0000000032b06000 CR4: 00000000003526f0 Call Trace: __sk_destruct+0x86/0x660 net/core/sock.c:2339 rcu_do_batch kernel/rcu/tree.c:2605 [inline] rcu_core+0xca8/0x1770 kernel/rcu/tree.c:2861 handle_softirqs+0x286/0x870 kernel/softirq.c:579 __do_softirq kernel/softirq.c:613 [inline] invoke_softirq kernel/softirq.c:453 [inline] __irq_exit_rcu+0xca/0x1f0 kernel/softirq.c:680 irq_exit_rcu+0x9/0x30 kernel/softirq.c:696 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1052 [inline] sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1052 Fixes: 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Reported-by: syzbot+4cabd1d2fa917a456db8@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68c0b6b5.050a0220.3c6139.0013.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250909232623.4151337-1-kuniyu@google.com --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ba581785adb4..a268e1595b22 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -408,8 +408,11 @@ more_data: if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) + if (!psock->cork) { + sk_msg_free(sk, msg); + *copied = 0; return -ENOMEM; + } } memcpy(psock->cork, msg, sizeof(*msg)); return 0; -- cgit v1.2.3 From 7fcbe5b2c6a4b5407bf2241fdb71e0a390f6ab9a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 25 Aug 2025 23:07:24 +0900 Subject: can: j1939: implement NETDEV_UNREGISTER notification handler syzbot is reporting unregister_netdevice: waiting for vcan0 to become free. Usage count = 2 problem, for j1939 protocol did not have NETDEV_UNREGISTER notification handler for undoing changes made by j1939_sk_bind(). Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But we need to call j1939_priv_put() against an extra ref held by j1939_sk_bind() call (as a part of undoing changes made by j1939_sk_bind()) as soon as NETDEV_UNREGISTER notification fires (i.e. before j1939_sk_sock_destruct() is called via j1939_sk_release()). 
Otherwise, the extra ref on "struct j1939_priv" held by the j1939_sk_bind() call prevents "struct net_device" from dropping the usage count to 1, making it impossible for unregister_netdevice() to continue. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84 Tested-by: syzbot Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/ac9db9a4-6c30-416e-8b94-96e6559d55b2@I-love.SAKURA.ne.jp [mkl: remove space in front of label] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/j1939-priv.h | 1 + net/can/j1939/main.c | 3 +++ net/can/j1939/socket.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'net') diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111..81f58924b4ac 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv); /* notify/alert all j1939 sockets bound to ifindex */ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv); int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42..3706a872ecaf 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; + case NETDEV_UNREGISTER: + j1939_sk_netdev_event_unregister(priv); + break; } j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9..70ebc861ea2a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1300,6 +1300,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) read_unlock_bh(&priv->j1939_socks_lock); } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ + struct sock *sk; + struct j1939_sock *jsk; + bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ + read_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + /* Skip if j1939_jsk_add() is not called on this socket. */ + if (!(jsk->state & J1939_SOCK_BOUND)) + continue; + sk = &jsk->sk; + sock_hold(sk); + read_unlock_bh(&priv->j1939_socks_lock); + /* Check if j1939_jsk_del() is not yet called on this socket after holding + * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call + * j1939_jsk_del() with socket's lock held. + */ + lock_sock(sk); + if (jsk->state & J1939_SOCK_BOUND) { + /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). + * Make this socket no longer bound, by pretending as if j1939_sk_bind() + * dropped old references but did not get new references. + */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + j1939_netdev_stop(priv); + /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from + * calling the corresponding j1939_priv_put(). + * + * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after + * an RCU grace period.
But since the caller is holding a ref on this + * "priv", we can defer synchronize_rcu() until immediately before + * the caller calls j1939_priv_put(). + */ + j1939_priv_put(priv); + jsk->priv = NULL; + wait_rcu = true; + } + release_sock(sk); + sock_put(sk); + goto rescan; + } + read_unlock_bh(&priv->j1939_socks_lock); + if (wait_rcu) + synchronize_rcu(); +} + static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { -- cgit v1.2.3 From f214744c8a27c3c1da6b538c232da22cd027530e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:30:09 +0900 Subject: can: j1939: j1939_sk_bind(): call j1939_priv_put() immediately when j1939_local_ecu_get() failed Commit 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") expects that a call to j1939_priv_put() can be unconditionally delayed until j1939_sk_sock_destruct() is called. But a refcount leak will happen when j1939_sk_bind() is called again after j1939_local_ecu_get() from a previous j1939_sk_bind() call returned an error. We need to call j1939_priv_put() before j1939_sk_bind() returns an error. Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/4f49a1bc-a528-42ad-86c0-187268ab6535@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 70ebc861ea2a..88e7160d4248 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); + jsk->priv = NULL; + synchronize_rcu(); + j1939_priv_put(priv); goto out_release_sock; } -- cgit v1.2.3 From 06e02da29f6f1a45fc07bd60c7eaf172dc21e334 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 24 Aug 2025 19:27:40 +0900 Subject: can: j1939: j1939_local_ecu_get(): undo increment when j1939_local_ecu_get() fails Since j1939_sk_bind() and j1939_sk_release() call j1939_local_ecu_put() when J1939_SOCK_BOUND was already set, whereas the error handling path for j1939_sk_bind() will not set J1939_SOCK_BOUND when j1939_local_ecu_get() fails, j1939_local_ecu_get() needs to undo priv->ents[sa].nusers++ when it returns an error. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/e7f80046-4ff7-4ce2-8ad8-7c3c678a42c9@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/bus.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed8..797719cb227e 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); - if (err) + if (err) { + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; goto done; + } ecu->nusers++; /* TODO: do we care if ecu->addr != sa?
*/ -- cgit v1.2.3 From 5e13f2c491a4100d208e77e92fe577fe3dbad6c2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 14:45:21 +0200 Subject: netfilter: nft_set_bitmap: fix lockdep splat due to missing annotation Running the new 'set_flush_add_atomic_bitmap' test case for nftables.git with CONFIG_PROVE_RCU_LIST=y yields: net/netfilter/nft_set_bitmap.c:231 RCU-list traversed in non-reader section!! rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/4008: #0: ffff888147f79cd8 (&nft_net->commit_mutex){+.+.}-{4:4}, at: nf_tables_valid_genid+0x2f/0xd0 lockdep_rcu_suspicious+0x116/0x160 nft_bitmap_walk+0x22d/0x240 nf_tables_delsetelem+0x1010/0x1a00 .. This is a false positive: the list cannot be altered while the transaction mutex is held, so pass the relevant argument to the iterator. Fixes tag intentionally wrong; no point in picking this up if earlier false-positive-fixups were not applied. Fixes: 28b7a6b84c0a ("netfilter: nf_tables: avoid false-positive lockdep splats in set walker") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; -- cgit v1.2.3 From c4eaca2e1052adfd67bed0a36a9d4b8e515666e4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:18 +0200 Subject: netfilter: nft_set_pipapo: don't check genbit from packetpath lookups The pipapo set type is special in that it has two copies of its data structure: one live copy containing only valid elements and one on-demand clone used during transactions where adds/deletes happen. This clone is not visible to the datapath. This is unlike all other set types in nftables; those all link new elements into their live hlist/tree. For those sets, the lookup functions must skip the new elements while the transaction is ongoing to ensure consistency. As the clone is shallow, removal does have an effect on the packet path: once the transaction enters the commit phase the 'gencursor' bit that determines which elements are active and which elements should be ignored (because they are no longer valid) is flipped. This causes the datapath lookup to ignore these elements if they are found during lookup. This opens up a small race window where pipapo has an inconsistent view of the dataset from when the transaction-cpu flipped the genbit until the transaction-cpu calls nft_pipapo_commit() to swap live/clone pointers: cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit removes elements from the clone so they won't be found anymore B) lookup in data structure can't see new elements yet, but old elements are ignored -> Only matches elements that were not changed in the transaction II) calls nft_pipapo_commit(), clone and live pointers are swapped. C) New nft_lookup happening now will find matching elements.
Consider a packet matching range r1-r2: cpu0 processes the following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to the above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. At the same time, r1-r3 is not visible yet, because it can only be found in the clone. The situation persists for all lookups until after cpu0 hits II). The fix is easy: Don't check the genbit from pipapo lookup functions. This is possible because unlike the other set types, the new elements are not reachable from the live copy of the dataset. The clone/live pointer swap is enough to avoid matching on old elements while at the same time all new elements are exposed in one go. After this change, step B above returns a match in r1-r2. This is fine: r1-r2 only becomes truly invalid the moment it gets freed. This happens after a synchronize_rcu() call, and the rcu read lock is held via netfilter hook traversal (nf_hook_slow()). Cc: Stefano Brivio Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 20 ++++++++++++++++++-- net/netfilter/nft_set_pipapo_avx2.c | 4 +--- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9a10251228fd..793790d79d13 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -510,6 +510,23 @@ out: * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. + * Unlike other set types, this uses NFT_GENMASK_ANY instead of + * nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match; they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but the + * newly matching entries are unreachable. + * + * GENMASK will still find the 'now old' entries which ensures consistent + * priv->match view. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. * * Return: ntables API extension pointer or NULL if no match. */ @@ -518,12 +535,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); return e ?
&e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 2f090e253caf..c0884fa68c79 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1152,7 +1152,6 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, struct nft_pipapo *priv = nft_set_priv(set); const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; @@ -1248,8 +1247,7 @@ next_match: if (last) { const struct nft_set_ext *e = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(e) || - !nft_set_elem_active(e, genmask))) + if (unlikely(nft_set_elem_expired(e))) goto next_match; ext = e; -- cgit v1.2.3 From a60f7bf4a1524d8896b76ba89623080aebf44272 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:19 +0200 Subject: netfilter: nft_set_rbtree: continue traversal if element is inactive When the rbtree lookup function finds a match, it sets the range start interval to a potentially inactive element. Then, after tree lookup, if the matching element is inactive, it returns NULL and suppresses a matching result. This is wrong and leads to false negative matches when a transaction has already entered the commit phase. cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: I) increments the genbit A) observes new genbit B) finds matching range C) returns no match: found range invalid in new generation II) removes old elements from the tree D) New nft_lookup happening now will find the matching element, because it is no longer obscured by the old, inactive one. Consider a packet P matching range r1-r2: cpu0 processes the following transaction: 1. remove r1-r2 2. add r1-r3 P is contained in both ranges. Therefore, cpu1 should always find a match for P. Due to the above race, this is not the case: cpu1 does find r1-r2, but then ignores it due to the genbit indicating the range has been removed. It does NOT test for further matches. The situation persists for all lookups until after cpu0 hits II), after which the r1-r3 range start node is tested for the first time. Move the "interval start is valid" check ahead so that tree traversal continues if the starting interval is not valid in this generation. Thanks to Stefan Hanreich for providing an initial reproducer for this bug.
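The shape of the fix can be modeled outside the kernel: when a descent keeps a best-so-far candidate, validity must be tested at the time the candidate is recorded, otherwise a stale node can mask a valid one that traversal would still visit. A minimal userspace sketch of the corrected descent (hypothetical node layout, not the nft_rbtree code):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	int start;
	bool active;	/* models the genmask/expiry checks */
	struct node *left, *right;
};

/* Find an interval start <= key. Validity is tested when the candidate
 * is recorded, so an inactive node no longer terminates the search. */
static const struct node *lookup(const struct node *n, int key)
{
	const struct node *interval = NULL;

	while (n) {
		if (key < n->start) {
			n = n->left;
		} else {
			if (n->active)	/* was: record unconditionally, check at the end */
				interval = n;
			n = n->right;
		}
	}
	return interval;
}

int main(void)
{
	/* valid r1-r3 start with the stale r1-r2 start in its right subtree */
	struct node stale = { .start = 1, .active = false };
	struct node valid = { .start = 1, .active = true, .right = &stale };

	puts(lookup(&valid, 2) ? "match" : "no match");	/* prints "match" */
	return 0;
}

With the old ordering, the stale node would overwrite the candidate and the final validity check would reject it, producing the false negative described above.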
Reported-by: Stefan Hanreich Fixes: c1eda3c6394f ("netfilter: nft_rbtree: ignore inactive matching element with no descendants") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_rbtree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..b1f04168ec93 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) return &interval->ext; -- cgit v1.2.3 From 64102d9bbc3d41dac5188b8fba75b1344c438970 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:20 +0200 Subject: netfilter: nf_tables: place base_seq in struct net This will soon be read from the packet path at around the same time as the gencursor. Both gencursor and base_seq get incremented almost at the same time, so it makes sense to place them in the same structure. This doesn't increase struct net size on 64bit due to padding. Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 - include/net/netns/nftables.h | 1 + net/netfilter/nf_tables_api.c | 65 ++++++++++++++++++++------------------- 3 files changed, 34 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 891e43a01bdc..3faa80f5d801 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1912,7 +1912,6 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; u64 tstamp; - unsigned int base_seq; unsigned int gc_seq; u8 validate_state; struct work_struct destroy_work; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5..99dd166c5d07 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c1082de09656..9518b50695ba 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1158,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0,
nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -1248,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2030,7 +2033,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2133,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -3671,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3839,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4050,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4887,7 +4890,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -5032,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -6209,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6238,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6331,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6630,7 +6633,7 @@ static int 
nf_tables_getsetelem_reset(struct sk_buff *skb, } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -8381,7 +8384,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -8446,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8480,7 +8483,7 @@ cont: idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8649,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8754,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9442,7 +9444,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9511,7 +9513,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9696,17 +9698,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10968,11 +10969,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. 
*/ - base_seq = READ_ONCE(nft_net->base_seq); + base_seq = nft_base_seq(net); while (++base_seq == 0) ; - WRITE_ONCE(nft_net->base_seq, base_seq); + WRITE_ONCE(net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); @@ -11181,7 +11182,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, nft_net->base_seq); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11506,7 +11507,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || nft_net->base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); @@ -12143,7 +12144,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); -- cgit v1.2.3 From 11fe5a82e53ac3581a80c88e0e35fb8a80e15f48 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:21 +0200 Subject: netfilter: nf_tables: make nft_set_do_lookup available unconditionally This function was added for retpoline mitigation and is replaced by a static inline helper if mitigations are not enabled. Enable this helper function unconditionally so next patch can add a lookup restart mechanism to fix possible false negatives while transactions are in progress. Adding lookup restarts in nft_lookup_eval doesn't work as nft_objref would then need the same copypaste loop. This patch is separate to ease review of the actual bug fix. 
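As background on why the explicit ops comparisons are worth keeping: under retpoline, indirect calls such as set->ops->lookup() go through a costly thunk, so the helper tests the ops pointer against known backends and makes direct calls, falling back to the indirect call only for anything not special-cased. A minimal userspace model of that dispatch pattern (toy types and backends, not the kernel ones; a sketch only):

#include <stdio.h>

struct ops { int (*lookup)(int key); };

static int hash_lookup(int key) { return key & 1; }
static int rbtree_lookup(int key) { return key > 10; }

static const struct ops hash_ops = { .lookup = hash_lookup };
static const struct ops rbtree_ops = { .lookup = rbtree_lookup };

/* Direct-call dispatch: comparing the ops pointer lets the compiler
 * emit plain calls instead of an indirect branch that retpolines
 * would slow down; unknown backends still take the indirect call. */
static int do_lookup(const struct ops *ops, int key)
{
	if (ops == &hash_ops)
		return hash_lookup(key);
	if (ops == &rbtree_ops)
		return rbtree_lookup(key);
	return ops->lookup(key);
}

int main(void)
{
	printf("%d %d\n", do_lookup(&hash_ops, 3), do_lookup(&rbtree_ops, 42));
	return 0;
}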
Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables_core.h | 10 ++-------- net/netfilter/nft_lookup.c | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6c2f483d9828..656e784714f3 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -109,17 +109,11 @@ nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, const struct nft_set_ext * nft_hash_lookup(const struct net *net, const struct nft_set *set, const u32 *key); +#endif + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key); -#else -static inline const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) -{ - return set->ops->lookup(net, set, key); -} -#endif /* called from nft_pipapo_avx2.c */ const struct nft_set_ext * diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba..2c6909bf1b40 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,17 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set, return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); +#endif return set->ops->lookup(net, set, key); } + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + return __nft_set_do_lookup(net, set, key); +} EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, -- cgit v1.2.3 From b2f742c846cab9afc5953a5d8f17b54922dcc723 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 10:02:22 +0200 Subject: netfilter: nf_tables: restart set lookup on base_seq change The hash, hash_fast, rhash and bitwise sets may indicate no result even though a matching element exists during a short time window while another cpu is finalizing the transaction. This happens when the hash lookup/bitwise lookup function has picked up the old genbit, right before it was toggled by nf_tables_commit(), but then the committing cpu managed to unlink the matching old element from the hash table: cpu0 cpu1 has added new elements to clone has marked elements as being inactive in new generation perform lookup in the set enters commit phase: A) observes old genbit increments base_seq I) increments the genbit II) removes old element from the set B) finds matching element C) returns no match: found element is not valid in old generation Next lookup observes new genbit and finds matching e2. Consider a packet P matching elements e1 and e2. cpu0 processes the following transaction: 1. remove e1 2. add e2, which has the same key as e1. P matches both e1 and e2. Therefore, cpu1 should always find a match for P. Due to the above race, this is not the case: cpu1 observed the old genbit.
e2 will not be considered once it is found. The element e1 is not found anymore if cpu0 managed to unlink it from the hlist before cpu1 found it during list traversal. The situation only occurs for a brief time period; lookups happening after I) observe the new genbit and return e2. This problem exists in all set types except nft_set_pipapo, so fix it once in nft_lookup rather than in each set type individually. Sample the base sequence counter, which gets incremented right before the genbit is changed. Then, if no match is found, retry the lookup if the base sequence was altered in between. If the base sequence hasn't changed: - No update took place: the no-match result is expected. This is the common case. or: - nf_tables_commit() hasn't progressed to the genbit update yet. Old elements were still visible and a no-match result is expected, or: - nf_tables_commit updated the genbit: we picked up the new base_seq, so the lookup function also picked up the new genbit, and a no-match result is expected. If the old genbit was observed, then nft_lookup also picked up the old base_seq: nft_lookup_should_retry() returns true and the lookup is repeated in the new generation. This problem was introduced when the unconditional synchronize_rcu() call that followed the current/next generation bit toggle was removed. Thanks to Pablo Neira Ayuso for reviewing an earlier version of this patchset, and for suggesting the re-use of the existing base_seq and the placement of the restart loop in nft_set_do_lookup(). Fixes: 0cbc06b3faba ("netfilter: nf_tables: remove synchronize_rcu in commit phase") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 3 ++- net/netfilter/nft_lookup.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9518b50695ba..c3c73411c40c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10973,7 +10973,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) while (++base_seq == 0) ; - WRITE_ONCE(net->nft.base_seq, base_seq); + /* pairs with smp_load_acquire in nft_lookup_eval */ + smp_store_release(&net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 2c6909bf1b40..58c5b14889c4 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -55,11 +55,40 @@ __nft_set_do_lookup(const struct net *net, const struct nft_set *set, return set->ops->lookup(net, set, key); } +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { - return __nft_set_do_lookup(net, set, key); + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation.
+ */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; } EXPORT_SYMBOL_GPL(nft_set_do_lookup); -- cgit v1.2.3 From cdbc9836c7afadad68f374791738f118263c5371 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 3 Jul 2025 12:10:50 +0200 Subject: libceph: fix invalid accesses to ceph_connection_v1_info There is a place where generic code in messenger.c is reading and another place where it is writing to con->v1 union member without checking that the union member is active (i.e. msgr1 is in use). On 64-bit systems, con->v1.auth_retry overlaps with con->v2.out_iter, so such a read is almost guaranteed to return a bogus value instead of 0 when msgr2 is in use. This ends up being fairly benign because the side effect is just the invalidation of the authorizer and successive fetching of new tickets. con->v1.connect_seq overlaps with con->v2.conn_bufs and the fact that it's being written to can cause more serious consequences, but luckily it's not something that happens often. Cc: stable@vger.kernel.org Fixes: cd1a677cad99 ("libceph, ceph: implement msgr2.1 protocol (crc and secure modes)") Signed-off-by: Ilya Dryomov Reviewed-by: Viacheslav Dubeyko --- net/ceph/messenger.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d1b5705dc0c6..9f6d860411cb 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1524,7 +1524,7 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->v1.auth_retry) { + if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); @@ -1714,9 +1714,10 @@ static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { - dout("clear_standby %p and ++connect_seq\n", con); + dout("clear_standby %p\n", con); con->state = CEPH_CON_S_PREOPEN; - con->v1.connect_seq++; + if (!ceph_msgr2(from_msgr(con->msgr))) + con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } -- cgit v1.2.3 From c3f8d13357deab1e04f8a52b499d6b9b704e578e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Sep 2025 15:11:21 +0200 Subject: wifi: nl80211: completely disable per-link stats for now After commit 8cc71fc3b82b ("wifi: cfg80211: Fix "no buffer space available" error in nl80211_get_station() for MLO"), the per-link data is only included in station dumps, where the size limit is somewhat less of an issue. However, it's still an issue, depending on how many links a station has and how much per-link data there is. Thus, for now, disable per-link statistics entirely. A complete fix will need to take this into account, make it opt-in by userspace, and change the dump format to be able to split a single station's data across multiple netlink dump messages, which all together is too much development for a fix. 
Fixes: 82d7f841d9bd ("wifi: cfg80211: extend to embed link level statistics in NL message") Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f2f7424e930c..852573423e52 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7575,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo, true) < 0) + &sinfo, false) < 0) goto out; sta_idx++; -- cgit v1.2.3 From 8884c693991333ae065830554b9b0c96590b1bb2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:31 +0000 Subject: hsr: use rtnl lock when iterating over ports hsr_for_each_port is called in many places without holding the RCU read lock; this may trigger warnings on debug kernels. Most of the callers actually hold the rtnl lock. So add a new helper, hsr_for_each_port_rtnl, to allow callers in suitable contexts to iterate ports safely without explicit RCU locking. This patch only fixes the callers that hold the rtnl lock. Other caller issues will be fixed in later patches. Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 18 +++++++++--------- net/hsr/hsr_main.c | 2 +- net/hsr/hsr_main.h | 3 +++ 3 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 88657255fec1..bce7b4061ce0 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master) ASSERT_RTNL(); - hsr_for_each_port(master->hsr, port) { + hsr_for_each_port_rtnl(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; @@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) struct hsr_port *port; mtu_max = ETH_DATA_LEN; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); @@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev) struct hsr_priv *hsr; hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, * may become enabled.
*/ features &= ~NETIF_F_ONE_FOR_ALL; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); @@ -484,7 +484,7 @@ static void hsr_set_rx_mode(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -506,7 +506,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -534,7 +534,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; @@ -580,7 +580,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec..ac1eb1db1a52 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce019..33b0d2460c9b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv { #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) + struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ -- cgit v1.2.3 From 393c841fe4333cdd856d0ca37b066d72746cfaa6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:32 +0000 Subject: hsr: use hsr_for_each_port_rtnl in hsr_port_get_hsr hsr_port_get_hsr() iterates over ports using hsr_for_each_port(), but many of its callers do not hold the required RCU lock. Switch to hsr_for_each_port_rtnl(), since most callers already hold the rtnl lock. After review, all callers are covered by either the rtnl lock or the RCU lock, except hsr_dev_xmit(). Fix this by adding an RCU read lock there. 
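All three hsr patches in this series lean on the same two primitives, which also appear in the nft_set_bitmap fix above: list_for_each_entry_rcu() takes an optional lockdep condition naming a lock that excludes list updates, and lock-free callers must bracket the walk with the RCU read lock. A kernel-style sketch of both modes (demo types are hypothetical; the reference-taking mirrors the hsr_get_port_ndev fix that follows):

#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <linux/rtnetlink.h>

struct demo_port {
	struct list_head port_list;
	struct net_device *dev;
	int type;
};

struct demo_priv {
	struct list_head ports;
};

/* Caller holds rtnl: the fourth argument tells CONFIG_PROVE_RCU_LIST
 * which lock forbids concurrent list mutation, silencing the splat. */
static struct demo_port *demo_get_port_rtnl(struct demo_priv *priv, int type)
{
	struct demo_port *port;

	list_for_each_entry_rcu(port, &priv->ports, port_list,
				lockdep_rtnl_is_held())
		if (port->type == type)
			return port;
	return NULL;
}

/* Caller holds no lock (e.g. the xmit path): take the RCU read lock,
 * and grab a reference before unlocking if the object must outlive
 * the critical section - otherwise this would be a use-after-free. */
static struct net_device *demo_get_dev(struct demo_priv *priv, int type)
{
	struct net_device *dev = NULL;
	struct demo_port *port;

	rcu_read_lock();
	list_for_each_entry_rcu(port, &priv->ports, port_list)
		if (port->type == type) {
			dev = port->dev;
			dev_hold(dev);
			break;
		}
	rcu_read_unlock();
	return dev;
}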
Fixes: c5a759117210 ("net/hsr: Use list_head (and rcu) instead of array for slave devices.") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 3 +++ net/hsr/hsr_main.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index bce7b4061ce0..702da1f9aaa9 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; + rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } + rcu_read_unlock(); + return NETDEV_TX_OK; } diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index ac1eb1db1a52..bc94b07101d8 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; } -- cgit v1.2.3 From 847748fc66d08a89135a74e29362a66ba4e3ab15 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 5 Sep 2025 09:15:33 +0000 Subject: hsr: hold rcu and dev lock for hsr_get_port_ndev hsr_get_port_ndev calls hsr_for_each_port, which needs the RCU read lock to be held. On the other hand, before returning the port device, we need to hold a device reference to avoid a use-after-free in the caller. Suggested-by: Paolo Abeni Fixes: 9c10dd8eed74 ("net: hsr: Create and export hsr_get_port_ndev()") Signed-off-by: Hangbin Liu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250905091533.377443-4-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 20 ++++++++++++++------ net/hsr/hsr_device.c | 7 ++++++- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index dadce6009791..e42d0fdefee1 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -654,7 +654,7 @@ static void icssg_prueth_hsr_fdb_add_del(struct prueth_emac *emac, static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -663,11 +663,15 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, true); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); @@ -679,7 +683,7 @@ static int icssg_prueth_hsr_add_mcast(struct net_device *ndev, const u8 *addr) static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) { - struct net_device *real_dev; + struct net_device *real_dev, *port_dev; struct prueth_emac *emac; u8 vlan_id, i; @@ -688,11 +692,15
@@ static int icssg_prueth_hsr_del_mcast(struct net_device *ndev, const u8 *addr) if (is_hsr_master(real_dev)) { for (i = HSR_PT_SLAVE_A; i < HSR_PT_INTERLINK; i++) { - emac = netdev_priv(hsr_get_port_ndev(real_dev, i)); - if (!emac) + port_dev = hsr_get_port_ndev(real_dev, i); + emac = netdev_priv(port_dev); + if (!emac) { + dev_put(port_dev); return -EINVAL; + } icssg_prueth_hsr_fdb_add_del(emac, addr, vlan_id, false); + dev_put(port_dev); } } else { emac = netdev_priv(real_dev); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 702da1f9aaa9..fbbc3ccf9df6 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -675,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; + rcu_read_lock(); hsr_for_each_port(hsr, port) - if (port->type == pt) + if (port->type == pt) { + dev_hold(port->dev); + rcu_read_unlock(); return port->dev; + } + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); -- cgit v1.2.3 From 1827f773e416842bb0a1be93f313e02591e0b0c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 5 Sep 2025 15:15:38 -0700 Subject: net: xdp: pass full flags to xdp_update_skb_shared_info() xdp_update_skb_shared_info() needs to update skb state which was maintained in xdp_buff / frame. Pass the full flags into it, instead of breaking it out bit by bit. We will need to add a bit for unreadable frags (even though XDP doesn't support those, the driver paths may be common), at which point almost all call sites would become: xdp_update_skb_shared_info(skb, num_frags, sinfo->xdp_frags_size, MY_PAGE_SIZE * num_frags, xdp_buff_is_frag_pfmemalloc(xdp), xdp_buff_is_frag_unreadable(xdp)); Keep a helper for accessing the flags, in case we need to transform them somehow in the future (e.g. to cover up xdp_buff vs xdp_frame differences). While we are touching all callers, rename the helper to xdp_update_skb_frags_info(); the previous name may have implied that it's the shinfo that's updated. We are updating flags in struct sk_buff based on frags that got attached.
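For illustration, this is why passing the raw flags word scales: each skb-level bit is decoded once inside the helper, so a future bit costs one line there and nothing at the driver call sites. A sketch of the helper after such an extension (the XDP_FLAGS_FRAGS_UNREADABLE bit and skb->unreadable field are hypothetical, not part of this patch; unchanged bookkeeping is elided):

static inline void
xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags,
			  unsigned int size, unsigned int truesize,
			  u32 xdp_flags)
{
	skb_shinfo(skb)->nr_frags = nr_frags;
	/* ... length/truesize accounting as in the patch ... */
	skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
	/* hypothetical follow-up bit: one line here, no call-site churn */
	/* skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); */
}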
Signed-off-by: Jakub Kicinski Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20250905221539.2930285-2-kuba@kernel.org Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 7 +++---- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 15 +++++++-------- drivers/net/ethernet/intel/ice/ice_txrx.c | 15 +++++++-------- drivers/net/ethernet/marvell/mvneta.c | 7 +++---- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 23 +++++++++++------------ drivers/net/virtio_net.c | 7 +++---- include/net/xdp.h | 23 +++++++++++------------ net/core/xdp.c | 21 ++++++++++----------- 8 files changed, 55 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 58d579dca3f1..3e77a96e5a3e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -468,9 +468,8 @@ bnxt_xdp_build_skb(struct bnxt *bp, struct sk_buff *skb, u8 num_frags, if (!skb) return NULL; - xdp_update_skb_shared_info(skb, num_frags, - sinfo->xdp_frags_size, - BNXT_RX_PAGE_SIZE * num_frags, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, + BNXT_RX_PAGE_SIZE * num_frags, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 048c33039130..98601c62c592 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2151,10 +2151,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * nr_frags); - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); /* First buffer has already been processed, so bump ntc */ if (++rx_ring->next_to_clean == rx_ring->count) @@ -2206,10 +2206,9 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) { - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); i40e_process_rx_buffs(rx_ring, I40E_XDP_PASS, xdp); } else { diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index d2871757ec94..107632a71f3c 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -1035,10 +1035,9 @@ ice_build_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); return skb; } @@ -1115,10 +1114,10 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], sizeof(skb_frag_t) * 
nr_frags); - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, skinfo->nr_frags + nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); } return skb; diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 476e73e502fe..7351e98d73f4 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2416,10 +2416,9 @@ mvneta_swbm_build_skb(struct mvneta_port *pp, struct page_pool *pool, skb->ip_summed = mvneta_rx_csum(pp, desc_status); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, num_frags, - sinfo->xdp_frags_size, - num_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, num_frags, sinfo->xdp_frags_size, + num_frags * xdp->frame_sz, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index b8c609d91d11..2925ece136c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1796,10 +1796,9 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi if (xdp_buff_has_frags(&mxbuf->xdp)) { /* sinfo->nr_frags is reset by build_skb, calculate again. */ - xdp_update_skb_shared_info(skb, wi - head_wi - 1, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, wi - head_wi - 1, + sinfo->xdp_frags_size, truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); for (struct mlx5e_wqe_frag_info *pwi = head_wi + 1; pwi < wi; pwi++) pwi->frag_page->frags++; @@ -2105,10 +2104,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w struct mlx5e_frag_page *pagep; /* sinfo->nr_frags is reset by build_skb, calculate again. 
*/ - xdp_update_skb_shared_info(skb, frag_page - head_page, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, frag_page - head_page, + sinfo->xdp_frags_size, + truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); pagep = head_page; do @@ -2122,10 +2121,10 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w if (xdp_buff_has_frags(&mxbuf->xdp)) { struct mlx5e_frag_page *pagep; - xdp_update_skb_shared_info(skb, sinfo->nr_frags, - sinfo->xdp_frags_size, truesize, - xdp_buff_is_frag_pfmemalloc( - &mxbuf->xdp)); + xdp_update_skb_frags_info(skb, sinfo->nr_frags, + sinfo->xdp_frags_size, + truesize, + xdp_buff_get_skb_flags(&mxbuf->xdp)); pagep = frag_page - sinfo->nr_frags; do diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 975bdc5dab84..06708c9a979e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2185,10 +2185,9 @@ static struct sk_buff *build_skb_from_xdp_buff(struct net_device *dev, skb_metadata_set(skb, metasize); if (unlikely(xdp_buff_has_frags(xdp))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - xdp_frags_truesz, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + xdp_frags_truesz, + xdp_buff_get_skb_flags(xdp)); return skb; } diff --git a/include/net/xdp.h b/include/net/xdp.h index af60e11b336c..976cfd2f113c 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -116,15 +116,14 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool -xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { - return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } -static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { - xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + return xdp->flags; } static __always_inline void @@ -294,10 +293,10 @@ static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool -xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) +static __always_inline u32 +xdp_frame_get_skb_flags(const struct xdp_frame *frame) { - return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 @@ -334,9 +333,9 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) } static inline void -xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, - unsigned int size, unsigned int truesize, - bool pfmemalloc) +xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + u32 xdp_flags) { struct skb_shared_info *sinfo = skb_shinfo(skb); @@ -350,7 +349,7 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, skb->len += size; skb->data_len += size; skb->truesize += truesize; - skb->pfmemalloc |= pfmemalloc; + skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } /* Avoids inlining WARN macro in fast-path */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 491334b9b8be..9100e160113a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) u32 tsize; tsize = 
sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, tsize, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); @@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; - bool pfmemalloc = false; + u32 flags = 0; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; @@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= page_is_pfmemalloc(page); + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } - xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, - tsize, pfmemalloc); + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); return true; } @@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdpf->frame_sz, - xdp_frame_is_frag_pfmemalloc(xdpf)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); -- cgit v1.2.3 From cda276bcb9a5c3d53620b3af9c372a87e0f92583 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 8 Sep 2025 10:32:31 +0300 Subject: ipv4: cipso: Simplify IP options handling in cipso_v4_error() When __ip_options_compile() is called with an skb, the IP options are parsed from the skb data into the provided IP option argument. This is in contrast to the case where the skb argument is NULL and the options are parsed from opt->__data. Given that cipso_v4_error() always passes an skb to __ip_options_compile(), there is no need to allocate an extra 40 bytes (maximum IP options size). Therefore, simplify the function by removing these extra bytes and make the function similar to ipv4_send_dest_unreach() which also calls both __ip_options_compile() and __icmp_send(). This is a preparation for changing the arguments being passed to __icmp_send(). No functional changes intended. Reviewed-by: Petr Machata Reviewed-by: David Ahern Acked-by: Paul Moore Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250908073238.119240-2-idosch@nvidia.com Signed-off-by: Paolo Abeni --- net/ipv4/cipso_ipv4.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 740af8541d2f..c7c949c37e2d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1715,8 +1715,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - unsigned char optbuf[sizeof(struct ip_options) + 40]; - struct ip_options *opt = (struct ip_options *)optbuf; + struct ip_options opt; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) @@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) * so we can not use icmp_send and IPCB here. 
*/ - memset(opt, 0, sizeof(struct ip_options)); - opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + memset(&opt, 0, sizeof(opt)); + opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); + res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &opt); else - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &opt); } /** -- cgit v1.2.3 From 0d3c4a441686663ad34aa3d6abe8c5317d21e707 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 8 Sep 2025 10:32:32 +0300 Subject: ipv4: icmp: Pass IPv4 control block structure as an argument to __icmp_send() __icmp_send() is used to generate ICMP error messages in response to various situations such as MTU errors (i.e., "Fragmentation Required") and too many hops (i.e., "Time Exceeded"). The skb that generated the error does not necessarily come from the IPv4 layer and does not always have a valid IPv4 control block in skb->cb. Therefore, commit 9ef6b42ad6fd ("net: Add __icmp_send helper.") changed the function to take the IP options structure as argument instead of deriving it from the skb's control block. Some callers of this function such as icmp_send() pass the IP options structure from the skb's control block as in these call paths the control block is known to be valid, but other callers simply pass a zeroed structure. A subsequent patch will need __icmp_send() to access more information from the IPv4 control block (specifically, the ifindex of the input interface). As a preparation for this change, change the function to take the IPv4 control block structure as an argument instead of the IP options structure. This makes the function similar to its IPv6 counterpart that already takes the IPv6 control block structure as an argument. No functional changes intended. 
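The refactor follows a common pattern: widen a parameter from one member (the options) to its containing struct (the control block) so that later patches can consume additional context without another signature change. A toy model of the before/after (simplified fields, not the kernel structs):

#include <stdio.h>
#include <string.h>

struct opts { int optlen; };
struct parm { struct opts opt; int iif; };	/* models struct inet_skb_parm */

/* Before: only the options were visible to the callee. */
static void send_err_old(const struct opts *opt)
{
	printf("optlen=%d\n", opt->optlen);
}

/* After: the whole control block is passed, so a later change can read
 * parm->iif (the input ifindex) without touching every caller again. */
static void send_err_new(const struct parm *parm)
{
	printf("optlen=%d iif=%d\n", parm->opt.optlen, parm->iif);
}

int main(void)
{
	struct parm parm;

	memset(&parm, 0, sizeof(parm));	/* callers zero it, as in the patch */
	parm.opt.optlen = 8;
	parm.iif = 3;
	send_err_old(&parm.opt);
	send_err_new(&parm);
	return 0;
}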
Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250908073238.119240-3-idosch@nvidia.com Signed-off-by: Paolo Abeni --- include/net/icmp.h | 10 ++++++---- net/ipv4/cipso_ipv4.c | 12 ++++++------ net/ipv4/icmp.c | 12 +++++++----- net/ipv4/route.c | 10 +++++----- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/icmp.h b/include/net/icmp.h index caddf4a59ad1..935ee13d9ae9 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -37,10 +37,10 @@ struct sk_buff; struct net; void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt); + const struct inet_skb_parm *parm); static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); + __icmp_send(skb_in, type, code, info, IPCB(skb_in)); } #if IS_ENABLED(CONFIG_NF_NAT) @@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - struct ip_options opts = { 0 }; - __icmp_send(skb_in, type, code, info, &opts); + struct inet_skb_parm parm; + + memset(&parm, 0, sizeof(parm)); + __icmp_send(skb_in, type, code, info, &parm); } #endif diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c7c949c37e2d..709021197e1c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1715,7 +1715,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - struct ip_options opt; + struct inet_skb_parm parm; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) @@ -1726,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) * so we can not use icmp_send and IPCB here. 
*/ - memset(&opt, 0, sizeof(opt)); - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + memset(&parm, 0, sizeof(parm)); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); else - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); } /** diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 863bf5023f2a..59fd0e1993a6 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -594,7 +594,7 @@ relookup_failed: */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt) + const struct inet_skb_parm *parm) { struct iphdr *iph; int room; @@ -725,7 +725,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + &parm->opt)) goto out_unlock; @@ -799,15 +800,16 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; - struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; + struct inet_skb_parm parm; struct nf_conn *ct; __be32 orig_ip; + memset(&parm, 0, sizeof(parm)); ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); return; } @@ -823,7 +825,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 50309f2ab132..6d27d3610c1c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1222,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct inet_skb_parm parm; struct net_device *dev; - struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. @@ -1233,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; - memset(&opt, 0, sizeof(opt)); + memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? 
skb->dev : skb_rtable(skb)->dst.dev; - res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) -- cgit v1.2.3 From 4a8c416602d97a4e2073ed563d4d4c7627de19cf Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 8 Sep 2025 10:32:33 +0300 Subject: ipv4: icmp: Fix source IP derivation in presence of VRFs When the "icmp_errors_use_inbound_ifaddr" sysctl is enabled, the source IP of ICMP error messages should be the "primary address of the interface that received the packet that caused the icmp error". The IPv4 ICMP code determines this interface using inet_iif() which in the input path translates to skb->skb_iif. If the interface that received the packet is a VRF port, skb->skb_iif will contain the ifindex of the VRF device and not that of the receiving interface. This is because in the input path the VRF driver overrides skb->skb_iif with the ifindex of the VRF device itself (see vrf_ip_rcv()). As such, the source IP that will be chosen for the ICMP error message is either an address assigned to the VRF device itself (if present) or an address assigned to some VRF port, not necessarily the input or output interface. This behavior is especially problematic when the error messages are "Time Exceeded" messages as it means that utilities like traceroute will show an incorrect packet path. Solve this by determining the input interface based on the iif field in the control block, if present. This field is set in the input path to skb->skb_iif and is not later overridden by the VRF driver, unlike skb->skb_iif. This behavior is consistent with the IPv6 counterpart that already uses the iif from the control block. Reported-by: Andy Roulin Reported-by: Rajkumar Srinivasan Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250908073238.119240-4-idosch@nvidia.com Signed-off-by: Paolo Abeni --- net/ipv4/icmp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 59fd0e1993a6..1b7fb5d935ed 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -710,7 +710,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, rcu_read_lock(); if (rt_is_input_route(rt) && READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) - dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); + dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif : + inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, iph->saddr, -- cgit v1.2.3 From ba941796d7cd1e81f51eed145dad1b47240ff420 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 29 Aug 2025 16:36:21 +0800 Subject: netfilter: ipset: Remove unused htable_bits in macro ahash_region Since the ahash_region() macro was redefined to calculate the region index solely from HTABLE_REGION_BITS, the htable_bits parameter became unused. Remove the unused htable_bits argument and its call sites, simplifying the code without changing semantics. 
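The simplified macro maps a hash bucket index to a lock region by plain division with a fixed region size, independent of the hashtable size. A worked example (the REGION_BITS value of 10 is illustrative, standing in for HTABLE_REGION_BITS):

#include <stdio.h>

#define REGION_BITS 10	/* illustrative stand-in for HTABLE_REGION_BITS */
#define jhash_size(n) ((unsigned int)1 << (n))
#define ahash_region(n) ((n) / jhash_size(REGION_BITS))

int main(void)
{
	/* With 2^10 buckets per region, bucket keys 0..1023 map to region 0,
	 * 1024..2047 to region 1, and so on - no htable_bits needed. */
	printf("%u %u %u\n", ahash_region(5u), ahash_region(1024u),
	       ahash_region(70000u));	/* prints: 0 1 68 */
	return 0;
}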
Fixes: 8478a729c046 ("netfilter: ipset: fix region locking in hash types") Signed-off-by: Zhen Ni Reviewed-by: Phil Sutter Signed-off-by: Florian Westphal --- net/netfilter/ipset/ip_set_hash_gen.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5251524b96af..5e4453e9ef8e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -63,7 +63,7 @@ struct hbucket { : jhash_size((htable_bits) - HTABLE_REGION_BITS)) #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) -#define ahash_region(n, htable_bits) \ +#define ahash_region(n) \ ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ @@ -702,7 +702,7 @@ retry: #endif key = HKEY(data, h->initval, htable_bits); m = __ipset_dereference(hbucket(t, key)); - nr = ahash_region(key, htable_bits); + nr = ahash_region(key); if (!m) { m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, @@ -852,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); elements = t->hregion[r].elements; maxelem = t->maxelem; @@ -1050,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); rcu_read_unlock_bh(); -- cgit v1.2.3 From cbd2257dc96e3e46217540fcb095a757ffa20d96 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 2 Sep 2025 13:28:08 +0200 Subject: netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support Expose the input bridge interface ethernet address so it can be used to redirect the packet to the receiving physical device for processing. Tested with nft command line tool. table bridge nat { chain PREROUTING { type filter hook prerouting priority 0; policy accept; ether daddr de:ad:00:00:be:ef meta pkttype set host ether daddr set meta ibrhwdr accept } } Joint work with Pablo Neira. 
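The new key can also be read in a match; a hypothetical example, assuming the nft tool exposes the get path under the same 'ibrhwdr' name used in the ruleset above:

    # count frames already addressed to the input bridge device's MAC
    nft add rule bridge nat PREROUTING meta ibrhwdr de:ad:00:00:be:ef counter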
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_meta_bridge.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8e0eb832bc01..7c0c915f0306 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -959,6 +959,7 @@ enum nft_exthdr_attributes { * @NFT_META_SDIF: slave device interface index * @NFT_META_SDIFNAME: slave device interface name * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit + * @NFT_META_BRI_IIFHWADDR: packet input bridge interface ethernet address */ enum nft_meta_keys { NFT_META_LEN, @@ -999,6 +1000,7 @@ enum nft_meta_keys { NFT_META_SDIFNAME, NFT_META_BRI_BROUTE, __NFT_META_IIFTYPE, + NFT_META_BRI_IIFHWADDR, }; /** diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5adced1e7d0c..b7af36bbd306 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store_be16(dest, htons(p_proto)); return; } + case NFT_META_BRI_IIFHWADDR: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + + memcpy(dest, br_dev->dev_addr, ETH_ALEN); + return; default: return nft_meta_get_eval(expr, regs, pkt); } @@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; + case NFT_META_BRI_IIFHWADDR: + len = ETH_ALEN; + break; default: return nft_meta_get_init(ctx, expr, tb); } @@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_BRI_BROUTE: + case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: -- cgit v1.2.3 From 944b6b216c0387ac3050cd8b773819ae360bfb1c Mon Sep 17 00:00:00 2001 From: Zhang Tengfei Date: Mon, 1 Sep 2025 21:46:54 +0800 Subject: ipvs: Use READ_ONCE/WRITE_ONCE for ipvs->enable KCSAN reported a data-race on the `ipvs->enable` flag, which is written in the control path and read concurrently from many other contexts. Following a suggestion by Julian, this patch fixes the race by converting all accesses to use `WRITE_ONCE()/READ_ONCE()`. This lightweight approach ensures atomic access and acts as a compiler barrier, preventing unsafe optimizations where the flag is checked in loops (e.g., in ip_vs_est.c). Additionally, the `enable` checks in the fast-path hooks (`ip_vs_in_hook`, `ip_vs_out_hook`, `ip_vs_forward_icmp`) are removed. These are unnecessary since commit 857ca89711de ("ipvs: register hooks only with services"). The `enable=0` condition they check for can only occur in two rare and non-fatal scenarios: 1) after hooks are registered but before the flag is set, and 2) after hooks are unregistered on cleanup_net. In the worst case, a single packet might be mishandled (e.g., dropped), which does not lead to a system crash or data corruption. Adding a check in the performance-critical fast-path to handle this harmless condition is not a worthwhile trade-off. 
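The compiler-barrier aspect matters most for the flag-in-loop checks mentioned above; a minimal sketch of the pattern (not the actual ipvs code):

    /* A plain load may legally be hoisted out of the loop by the
     * compiler, so the loop might never observe the control path's
     * store: */
    while (!ipvs->enable)
            cond_resched();

    /* READ_ONCE() forces a fresh, atomic load on every iteration and
     * pairs with WRITE_ONCE(ipvs->enable, 1) in the control path: */
    while (!READ_ONCE(ipvs->enable))
            cond_resched();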
Fixes: 857ca89711de ("ipvs: register hooks only with services") Reported-by: syzbot+1651b5234028c294c339@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1651b5234028c294c339 Suggested-by: Julian Anastasov Link: https://lore.kernel.org/lvs-devel/2189fc62-e51e-78c9-d1de-d35b8e3657e3@ssi.bg/ Signed-off-by: Zhang Tengfei Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- net/netfilter/ipvs/ip_vs_core.c | 11 ++++------- net/netfilter/ipvs/ip_vs_ctl.c | 6 +++--- net/netfilter/ipvs/ip_vs_est.c | 16 ++++++++-------- 4 files changed, 17 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 965f3c8e5089..37ebb0cb62b8 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -885,7 +885,7 @@ static void ip_vs_conn_expire(struct timer_list *t) * conntrack cleanup for the net. */ smp_rmb(); - if (ipvs->enable) + if (READ_ONCE(ipvs->enable)) ip_vs_conn_drop_conntrack(cp); } @@ -1439,7 +1439,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) cond_resched_rcu(); /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) break; } rcu_read_unlock(); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c7a8a08b7308..5ea7ab8bf4dc 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1353,9 +1353,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - if (!ipvs->enable) - return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1940,7 +1937,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); @@ -2108,7 +2105,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb, int r; /* ipvs enabled in this netns ? 
*/ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { @@ -2295,7 +2292,7 @@ static int __net_init __ip_vs_init(struct net *net) return -ENOMEM; /* Hold the beast until a service is registered */ - ipvs->enable = 0; + WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); @@ -2367,7 +2364,7 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); - ipvs->enable = 0; /* Disable packet reception */ + WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6a6fc4478533..4c8fa22be88a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -256,7 +256,7 @@ static void est_reload_work_handler(struct work_struct *work) struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock; if (!kd) continue; @@ -1483,9 +1483,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; - if (!ipvs->enable) { + if (!READ_ONCE(ipvs->enable)) { /* Now there is a service - full throttle */ - ipvs->enable = 1; + WRITE_ONCE(ipvs->enable, 1); /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 15049b826732..93a925f1ed9b 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -231,7 +231,7 @@ static int ip_vs_estimation_kthread(void *data) void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ @@ -306,7 +306,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - ipvs->enable && ipvs->est_max_threads) + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); @@ -343,7 +343,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) } /* Start kthread tasks only when services are present */ - if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { + if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; @@ -486,7 +486,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && ipvs->enable) + if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; @@ -663,7 +663,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; } @@ -681,7 +681,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) rcu_read_unlock(); local_bh_enable(); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || 
kthread_should_stop()) goto stop; cond_resched(); @@ -757,7 +757,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) continue; @@ -787,7 +787,7 @@ last_kt: id = ipvs->est_kt_count; next_kt: - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto unlock; id--; if (id < 0) -- cgit v1.2.3 From db99b2f2b3e2cd8227ac9990ca4a8a31a1e95e56 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Aug 2025 17:01:02 +0200 Subject: netfilter: nf_reject: don't reply to icmp error messages tcp reject code won't reply to a tcp reset. But the icmp reject 'netdev' family versions will reply to icmp dst-unreach errors, unlike icmp_send() and icmp6_send(), which are used by the inet family implementation (and internally by the REJECT target). Check for the icmp(6) type and do not respond if it's an unreachable error. Without this, something like 'ip protocol icmp reject', when used in a netdev chain attached to 'lo', causes a packet loop. Same for two hosts that both use such a rule: each error packet will be replied to. Such situations persist until the (bogus) rule is amended to ratelimit or check the icmp type before the reject statement. As the inet versions don't do this, make the netdev ones follow along. Signed-off-by: Florian Westphal --- net/ipv4/netfilter/nf_reject_ipv4.c | 25 +++++++++++++++++++++++++ net/ipv6/netfilter/nf_reject_ipv6.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 05631abe3f0d..fae4aa4a5f09 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -80,6 +80,27 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset); +static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + u8 *tp, _type; + int thoff; + + if (iph->protocol != IPPROTO_ICMP) + return false; + + thoff = skb_network_offset(skb) + sizeof(*iph); + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmphdr, type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMP_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -100,6 +121,10 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return NULL; + /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */ + if (nf_skb_is_icmp_unreach(oldskb)) + return NULL; + /* RFC says return as much as we can without exceeding 576 bytes.
*/ len = min_t(unsigned int, 536, oldskb->len); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 6b022449f867..ef5b7e85cffa 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -104,6 +104,32 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); +static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + u8 proto = ip6h->nexthdr; + u8 _type, *tp; + int thoff; + __be16 fo; + + thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); + + if (thoff < 0 || thoff >= skb->len || fo != 0) + return false; + + if (proto != IPPROTO_ICMPV6) + return false; + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMPV6_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -117,6 +143,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, if (!nf_reject_ip6hdr_validate(oldskb)) return NULL; + /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */ + if (nf_skb_is_icmp6_unreach(oldskb)) + return NULL; + /* Include "As much of invoking packet as possible without the ICMPv6 * packet exceeding the minimum IPv6 MTU" in the ICMP payload. */ -- cgit v1.2.3 From 247981eecd3dd6ff51bd0a0223deba8af39c5498 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 10 Sep 2025 20:37:16 +0000 Subject: net: Use NAPI_* in test_bit when stopping napi kthread napi_stop_kthread waits for the NAPI_STATE_SCHED_THREADED to be unset before stopping the kthread. But it uses test_bit with the NAPIF_STATE_SCHED_THREADED and that might stop the kthread early before the flag is unset. Use the NAPI_* variant of the NAPI state bits in test_bit instead. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 ./tools/testing/selftests/drivers/net/napi_threaded.py TAP version 13 1..2 ok 1 napi_threaded.change_num_queues ok 2 napi_threaded.enable_dev_threaded_disable_napi_threaded # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 689883de94dd ("net: stop napi kthreads when THREADED napi is disabled") Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250910203716.1016546-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 93a25d87b86b..8d49b2198d07 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi) * the kthread. */ while (true) { - if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); -- cgit v1.2.3 From 28d2420d403ada8a5ff1bf2077ef66051b2aa4d7 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 8 Sep 2025 18:45:48 +0800 Subject: net: af_packet: remove last_kactive_blk_num field kactive_blk_num (K) is only incremented on block close. 
In timer callback prb_retire_rx_blk_timer_expired, except delete_blk_timer is true, last_kactive_blk_num (L) is set to match kactive_blk_num (K) in all cases. L is also set to match K in prb_open_block. The only case K not equal to L is when scheduled by tpacket_rcv and K is just incremented on block close but no new block could be opened, so that it does not call prb_open_block in prb_dispatch_next_block. This patch modifies the prb_retire_rx_blk_timer_expired function by simply removing the check for L == K. This patch just provides another checkpoint to thaw the might-be-frozen block in any case. It doesn't have any effect because __packet_lookup_frame_in_block() has the same logic and does it again without this patch when detecting the ring is frozen. The patch only advances checking the status of the ring. Suggested-by: Willem de Bruijn Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://lore.kernel.org/all/20250831100822.1238795-1-jackzxcui1989@163.com/ Signed-off-by: Xin Zhao Link: https://patch.msgid.link/20250908104549.204412-2-jackzxcui1989@163.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 60 +++++++++++++++++++++++--------------------------- net/packet/internal.h | 6 ----- 2 files changed, 28 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9d42c4bd6e39..230cb8764615 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -669,7 +669,6 @@ static void init_prb_bdqc(struct packet_sock *po, p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; - p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; @@ -693,7 +692,6 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); - pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* @@ -750,38 +748,36 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) write_unlock(&pkc->blk_fill_in_prog_lock); } - if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { - if (!frozen) { - if (!BLOCK_NUM_PKTS(pbd)) { - /* An empty block. Just refresh the timer. */ - goto refresh_timer; - } - prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); - if (!prb_dispatch_next_block(pkc, po)) - goto refresh_timer; - else - goto out; + if (!frozen) { + if (!BLOCK_NUM_PKTS(pbd)) { + /* An empty block. Just refresh the timer. */ + goto refresh_timer; + } + prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); + if (!prb_dispatch_next_block(pkc, po)) + goto refresh_timer; + else + goto out; + } else { + /* Case 1. Queue was frozen because user-space was + * lagging behind. + */ + if (prb_curr_blk_in_use(pbd)) { + /* + * Ok, user-space is still behind. + * So just refresh the timer. + */ + goto refresh_timer; } else { - /* Case 1. Queue was frozen because user-space was - * lagging behind. + /* Case 2. queue was frozen,user-space caught up, + * now the link went idle && the timer fired. + * We don't have a block to close.So we open this + * block and restart the timer. + * opening a block thaws the queue,restarts timer + * Thawing/timer-refresh is a side effect. */ - if (prb_curr_blk_in_use(pbd)) { - /* - * Ok, user-space is still behind. - * So just refresh the timer. - */ - goto refresh_timer; - } else { - /* Case 2. queue was frozen,user-space caught up, - * now the link went idle && the timer fired. 
- * We don't have a block to close.So we open this - * block and restart the timer. - * opening a block thaws the queue,restarts timer - * Thawing/timer-refresh is a side effect. - */ - prb_open_block(pkc, pbd); - goto out; - } + prb_open_block(pkc, pbd); + goto out; } } diff --git a/net/packet/internal.h b/net/packet/internal.h index 1e743d0316fd..d367b9f93a73 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -24,12 +24,6 @@ struct tpacket_kbdq_core { unsigned short kactive_blk_num; unsigned short blk_sizeof_priv; - /* last_kactive_blk_num: - * trick to see if user-space has caught up - * in order to avoid refreshing timer when every single pkt arrives. - */ - unsigned short last_kactive_blk_num; - char *pkblk_start; char *pkblk_end; int kblk_size; -- cgit v1.2.3 From f7460d2989fa7fb29a0c6d8b929076521480a124 Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Mon, 8 Sep 2025 18:45:49 +0800 Subject: net: af_packet: Use hrtimer to do the retire operation In a system with high real-time requirements, the timeout mechanism of ordinary timers with jiffies granularity is insufficient to meet the demands for real-time performance. Meanwhile, the optimization of CPU usage with af_packet is quite significant. Use hrtimer instead of timer to help compensate for the shortcomings in real-time performance. On an HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time enough, with fluctuations reaching over 8ms (on a system with HZ=250). This is unacceptable in some high real-time systems that require timely processing of network packets. By replacing it with hrtimer, if a timeout of 2ms is set, the update of TP_STATUS_USER can be stabilized to within 3 ms. Delete the delete_blk_timer field: hrtimer_cancel waits until the timer callback returns and guarantees the callback is never entered again. Simplify the timeout handling: update the hrtimer expiry time only within the hrtimer callback, and no longer update it in prb_open_block, which is called by tpacket_rcv or by the timer callback. Reasons why the hrtimer is NOT updated in prb_open_block: 1) It would add complexity to distinguish the two caller scenarios. 2) hrtimer_cancel and hrtimer_start would need to be called to update the TMO of an already enqueued hrtimer, leading to complex shutdown logic. One side effect of NOT updating the hrtimer when called from tpacket_rcv is that a newly opened block triggered by tpacket_rcv may be retired earlier than expected. On the other hand, if the timeout were updated in prb_open_block, the frequent reception of network packets that leads to prb_open_block being called may cause the hrtimer to be removed and enqueued repeatedly. The retire hrtimer expiration is unconditional and periodic. If there are numerous packet sockets on the system, please set an appropriate timeout to avoid frequent enqueueing of hrtimers.
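Condensed from the diff below, the conversion follows the standard self-rearming hrtimer pattern in soft-IRQ context:

    hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
                  CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
    hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
                  HRTIMER_MODE_REL_SOFT);

    static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
    {
            /* ... retire/dispatch/reopen block work elided ... */
            hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
            return HRTIMER_RESTART;     /* re-arm for the next period */
    }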
Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://lore.kernel.org/all/20250831100822.1238795-1-jackzxcui1989@163.com/ Signed-off-by: Xin Zhao Link: https://patch.msgid.link/20250908104549.204412-3-jackzxcui1989@163.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 104 +++++++++++++------------------------------------ net/packet/diag.c | 2 +- net/packet/internal.h | 10 ++--- 3 files changed, 33 insertions(+), 83 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 230cb8764615..173e6edda08f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *, static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); -static void prb_retire_rx_blk_timer_expired(struct timer_list *); -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) return proto; } -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - timer_delete_sync(&pkc->retire_blk_timer); -} - static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - - spin_lock_bh(&rb_queue->lock); - pkc->delete_blk_timer = 1; - spin_unlock_bh(&rb_queue->lock); - - prb_del_retire_blk_timer(pkc); -} - -static void prb_setup_retire_blk_timer(struct packet_sock *po) -{ - struct tpacket_kbdq_core *pkc; - - pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, - 0); - pkc->retire_blk_timer.expires = jiffies; + hrtimer_cancel(&pkc->retire_blk_timer); } static int prb_calc_retire_blk_tmo(struct packet_sock *po, @@ -671,53 +650,34 @@ static void init_prb_bdqc(struct packet_sock *po, p1->version = po->tp_version; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); else - p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, - req_u->req3.tp_block_size); - p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, + req_u->req3.tp_block_size)); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po); + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, + HRTIMER_MODE_REL_SOFT); prb_open_block(p1, pbd); } -/* Do NOT update the last_blk_num first. - * Assumes sk_buff_head lock is held. - */ -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - mod_timer(&pkc->retire_blk_timer, - jiffies + pkc->tov_in_jiffies); -} - /* - * Timer logic: - * 1) We refresh the timer only when we open a block. - * By doing this we don't waste cycles refreshing the timer - * on packet-by-packet basis. 
- * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * - * So, if the user sets the 'tmo' to 10ms then the timer - * will never fire while the block is still getting filled - * (which is what we want). However, the user could choose - * to close a block early and that's fine. - * - * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. - * */ -static void prb_retire_rx_blk_timer_expired(struct timer_list *t) +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) { struct packet_sock *po = timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); @@ -730,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); - if (unlikely(pkc->delete_blk_timer)) - goto out; - /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() @@ -749,26 +706,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) } if (!frozen) { - if (!BLOCK_NUM_PKTS(pbd)) { - /* An empty block. Just refresh the timer. */ - goto refresh_timer; + if (BLOCK_NUM_PKTS(pbd)) { + /* Not an empty block. Need retire the block. */ + prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); + prb_dispatch_next_block(pkc, po); } - prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); - if (!prb_dispatch_next_block(pkc, po)) - goto refresh_timer; - else - goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ - if (prb_curr_blk_in_use(pbd)) { - /* - * Ok, user-space is still behind. - * So just refresh the timer. - */ - goto refresh_timer; - } else { + if (!prb_curr_blk_in_use(pbd)) { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this @@ -777,15 +724,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); - goto out; } } -refresh_timer: - _prb_refresh_rx_retire_blk_timer(pkc); - -out: + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); spin_unlock(&po->sk.sk_receive_queue.lock); + return HRTIMER_RESTART; } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, @@ -879,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) } /* - * Side effect of opening a block: + * prb_open_block is called by tpacket_rcv or timer callback. * - * 1) prb_queue is thawed. - * 2) retire_blk_timer is refreshed. + * Reasons why NOT update hrtimer in prb_open_block: + * 1) It will increase complexity to distinguish the two caller scenario. + * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update + * TMO of an already enqueued hrtimer, leading to complex shutdown logic. * + * One side effect of NOT update hrtimer when called by tpacket_rcv is that + * a newly opened block triggered by tpacket_rcv may be retired earlier than + * expected. 
On the other hand, if timeout is updated in prb_open_block, the + * frequent reception of network packets that leads to prb_open_block being + * called may cause hrtimer to be removed and enqueued repeatedly. */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) @@ -917,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); - _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } diff --git a/net/packet/diag.c b/net/packet/diag.c index 6ce1dcc284d9..c8f43e0c1925 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, pdr.pdr_frame_nr = ring->frame_max + 1; if (ver > TPACKET_V2) { - pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; + pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime); pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; pdr.pdr_features = ring->prb_bdqc.feature_req_word; } else { diff --git a/net/packet/internal.h b/net/packet/internal.h index d367b9f93a73..b76e645cd78d 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -20,10 +20,11 @@ struct tpacket_kbdq_core { unsigned int feature_req_word; unsigned int hdrlen; unsigned char reset_pending_on_curr_blk; - unsigned char delete_blk_timer; unsigned short kactive_blk_num; unsigned short blk_sizeof_priv; + unsigned short version; + char *pkblk_start; char *pkblk_end; int kblk_size; @@ -32,6 +33,7 @@ struct tpacket_kbdq_core { uint64_t knxt_seq_num; char *prev; char *nxt_offset; + struct sk_buff *skb; rwlock_t blk_fill_in_prog_lock; @@ -39,12 +41,10 @@ struct tpacket_kbdq_core { /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) - unsigned short retire_blk_tov; - unsigned short version; - unsigned long tov_in_jiffies; + ktime_t interval_ktime; /* timer to retire an outstanding block */ - struct timer_list retire_blk_timer; + struct hrtimer retire_blk_timer; }; struct pgv { -- cgit v1.2.3 From ac36dea3bc85c2cde87e490736708032328dfbdc Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Tue, 9 Sep 2025 05:26:07 -0700 Subject: ipv6: udp: fix typos in comments Correct typos in ipv6/udp.c comments: "execeeds" -> "exceeds" "tacking care" -> "taking care" "measureable" -> "measurable" No functional changes. Signed-off-by: Alok Tiwari Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250909122611.3711859-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/ipv6/udp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index a35ee6d693a8..b70369f3cd32 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -260,7 +260,7 @@ rescore: /* compute_score is too long of a function to be * inlined, and calling it again here yields - * measureable overhead for some + * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. 
*/ @@ -449,7 +449,7 @@ struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif -/* do not use the scratch area len for jumbogram: their length execeeds the +/* do not use the scratch area len for jumbogram: their length exceeds the * scratch area space; note that the IP6CB flags is still in the first * cacheline, so checking for jumbograms is cheap */ @@ -1048,7 +1048,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From 18282100d7040614b553f1cad737cb689c04e2b9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 10 Sep 2025 09:24:29 -0700 Subject: net: devmem: expose tcp_recvmsg_locked errors tcp_recvmsg_dmabuf can export the following errors: - EFAULT when linear copy fails - ETOOSMALL when cmsg put fails - ENODEV if one of the frags is readable - ENOMEM on xarray failures But they are all ignored and replaced by EFAULT in the caller (tcp_recvmsg_locked). Expose real error to the userspace to add more transparency on what specifically fails. In non-devmem case (skb_copy_datagram_msg) doing `if (!copied) copied=-EFAULT` is ok because skb_copy_datagram_msg can return only EFAULT. Reviewed-by: David Ahern Reviewed-by: Mina Almasry Reviewed-by: Eric Dumazet Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250910162429.4127997-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 588932c3cf1d..9c576dc9a1f7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2818,9 +2818,9 @@ found_ok_skb: err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, used); - if (err <= 0) { + if (err < 0) { if (!copied) - copied = -EFAULT; + copied = err; break; } -- cgit v1.2.3 From c1164178e9a86f63c2b39a187bd2670783a244b4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:18 +0200 Subject: net: bridge: Introduce BROPT_FDB_LOCAL_VLAN_0 The following patches will gradually introduce the ability of the bridge to look up local FDB entries on VLAN 0 instead of using the VLAN indicated by a packet. In this patch, just introduce the option itself, with which the feature will be linked. 
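Bridge boolean options are plain bits in br->options; the follow-up patches in this series gate behavior on the new bit with the existing helpers, roughly:

    if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))
            /* keep local FDB entries on VLAN 0 only */;

    br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on);  /* flip the bit */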
Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/ab85e33ef41ed19a3deaef0ff7da26830da30642.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_private.h | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8de0904b9627..87da287f19fe 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -487,6 +487,7 @@ enum net_bridge_opts { BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BROPT_FDB_LOCAL_VLAN_0, }; struct net_bridge { -- cgit v1.2.3 From 60d6be0931e931e1fb585242d3b391012cd113e3 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:19 +0200 Subject: net: bridge: BROPT_FDB_LOCAL_VLAN_0: Look up FDB on VLAN 0 on miss When BROPT_FDB_LOCAL_VLAN_0 is enabled, the local FDB entries for the member ports as well as the bridge itself should not be created per-VLAN, but instead only on VLAN 0. That means that br_handle_frame_finish() needs to make two lookups: the primary lookup on an appropriate VLAN, and when that misses, a lookup on VLAN 0. Have the second lookup only accept local MAC addresses. Turning this into a generic second-lookup feature is not the goal. Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/8087475009dce360fb68d873b1ed9c80827da302.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_input.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 5f6ac9bf1527..67b4c905e49a 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -202,6 +202,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); + if (unlikely(!dst && vid && + br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) { + dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0); + if (dst && + (!test_bit(BR_FDB_LOCAL, &dst->flags) || + test_bit(BR_FDB_ADDED_BY_USER, &dst->flags))) + dst = NULL; + } break; default: break; -- cgit v1.2.3 From 4cf5fd84978738acf3d3610c19c81bbe4a083b93 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:20 +0200 Subject: net: bridge: BROPT_FDB_LOCAL_VLAN_0: On port changeaddr, skip per-VLAN FDBs When BROPT_FDB_LOCAL_VLAN_0 is enabled, the local FDB entries for member ports should not be created per-VLAN, but instead only on VLAN 0. When the member port address changes, the local FDB entries need to be updated, which is done in br_fdb_changeaddr(). Under the VLAN-0 mode, only one local FDB entry will ever be added for a port's address, and that on VLAN 0. Thus bail out of the delete loop early. For the same reason, also skip adding the per-VLAN entries. 
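Condensed from the diff below: both the delete loop and the re-insert path treat the VLAN-0 mode like the no-VLANs case, so only the single VLAN 0 entry is ever touched:

    if (!vg || !vg->num_vlans || local_vlan_0)
            goto insert;    /* the add side similarly does 'goto done' */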
Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/0cf9d41836d2a245b0ce07e1a16ee05ca506cbe9.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_fdb.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 902694c0ce64..918c37554638 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -459,6 +459,9 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; + bool local_vlan_0; + + local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); @@ -468,11 +471,11 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) /* delete old one */ fdb_delete_local(br, p, f); - /* if this port has no vlan information - * configured, we can safely be done at - * this point. + /* if this port has no vlan information configured, or + * local entries are only kept on VLAN 0, we can safely + * be done at this point. */ - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto insert; } } @@ -481,7 +484,7 @@ insert: /* insert new address, may fail if invalid address or dup. */ fdb_add_local(br, p, newaddr, 0); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto done; /* Now add entries for every VLAN configured on the port. -- cgit v1.2.3 From 40df3b8e90eec85b0830dfc04d805f7ddbcc929d Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:21 +0200 Subject: net: bridge: BROPT_FDB_LOCAL_VLAN_0: On bridge changeaddr, skip per-VLAN FDBs When BROPT_FDB_LOCAL_VLAN_0 is enabled, the local FDB entries for the bridge itself should not be created per-VLAN, but instead only on VLAN 0. When the bridge address changes, the local FDB entries need to be updated, which is done in br_fdb_change_mac_address(). Bail out early when in VLAN-0 mode, so that the per-VLAN FDB entries are not created. The per-VLAN walk is only done afterwards. Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/0bd432cf91921ef7c4ed0e129de1d1cd358c716b.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_fdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 918c37554638..4a20578517a5 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -503,6 +503,9 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge_vlan *v; + bool local_vlan_0; + + local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); spin_lock_bh(&br->hash_lock); @@ -514,7 +517,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) fdb_add_local(br, NULL, newaddr, 0); vg = br_vlan_group(br); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto out; /* Now remove and add entries for every VLAN configured on the * bridge. 
This function runs under RTNL so the bitmap will not -- cgit v1.2.3 From a29aba64e022072eb1388e9bf041fbec6902553e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:22 +0200 Subject: net: bridge: BROPT_FDB_LOCAL_VLAN_0: Skip local FDBs on VLAN creation When BROPT_FDB_LOCAL_VLAN_0 is enabled, the local FDB entries for the member ports as well as the bridge itself should not be created per-VLAN, but instead only on VLAN 0. Thus when a VLAN is added for a port or the bridge itself, a local FDB entry with the corresponding address should not be added when in the VLAN-0 mode. Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/bb13ba01d58ed6d5d700e012c519d38ee6806d22.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_vlan.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 939a3aa78d5c..ae911220cb3c 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -331,10 +331,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { - err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); - if (err) { - br_err(br, "failed insert local address into bridge forwarding table\n"); - goto out_filt; + if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) { + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + goto out_filt; + } } vg->num_vlans++; } -- cgit v1.2.3 From 21446c06b441b9c993870efae71aef4e9aa72ec7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:23 +0200 Subject: net: bridge: Introduce UAPI for BR_BOOLOPT_FDB_LOCAL_VLAN_0 The previous patches introduced a new option, BR_BOOLOPT_FDB_LOCAL_VLAN_0. When enabled, it has local FDB entries installed only on VLAN 0, instead of duplicating them across all VLANs. In this patch, add the corresponding UAPI toggle, and the code for turning the feature on and off. 
Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/ea99bfb10f687fa58091e6e1c2f8acc33f47ca45.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 3 ++ net/bridge/br.c | 22 ++++++++++ net/bridge/br_fdb.c | 96 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 2 + 4 files changed, 123 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 73876c0e2bba..e52f8207ab27 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -823,6 +823,8 @@ struct br_mcast_stats { /* bridge boolean options * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping + * BR_BOOLOPT_FDB_LOCAL_VLAN_0 - local FDB entries installed by the bridge + * driver itself should only be added on VLAN 0 * * IMPORTANT: if adding a new option do not forget to handle * it in br_boolopt_toggle/get and bridge sysfs @@ -832,6 +834,7 @@ enum br_boolopt_id { BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BR_BOOLOPT_FDB_LOCAL_VLAN_0, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index c683baa3847f..512872a2ef81 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -259,6 +259,23 @@ static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; +static int +br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + int err; + + if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on) + return 0; + + err = br_fdb_toggle_local_vlan_0(br, on, extack); + if (err) + return err; + + br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on); + return 0; +} + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -287,6 +304,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + err = br_toggle_fdb_local_vlan_0(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -307,6 +327,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4a20578517a5..58d22e2b85fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -582,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } +static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); +} + +static void br_fdb_delete_locals_per_vlan(struct net_bridge *br) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + 
list_for_each_entry(p, &br->port_list, list) + br_fdb_delete_locals_per_vlan_port(br, p); + + br_fdb_delete_locals_per_vlan_port(br, NULL); +} + +static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + int err; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) + return err; + } + + return 0; +} + +static int br_fdb_insert_locals_per_vlan(struct net_bridge *br, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + err = br_fdb_insert_locals_per_vlan_port(br, p, extack); + if (err) + goto rollback; + } + + err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack); + if (err) + goto rollback; + + return 0; + +rollback: + NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed"); + br_fdb_delete_locals_per_vlan(br); + return err; +} + +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + if (!on) + return br_fdb_insert_locals_per_vlan(br, extack); + + br_fdb_delete_locals_per_vlan(br); + return 0; +} + static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 87da287f19fe..16be5d250402 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -844,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, -- cgit v1.2.3 From 9e472d9e84b11e9f3c429eba97c2a9e74461a884 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Sep 2025 02:18:50 +0100 Subject: tcp: Destroy TCP-AO, TCP-MD5 keys in .sk_destruct() Currently there are a couple of minor issues with destroying the keys tcp_v4_destroy_sock(): 1. The socket is yet in TCP bind buckets, making it reachable for incoming segments [on another CPU core], potentially available to send late FIN/ACK/RST replies. 2. There is at least one code path, where tcp_done() is called before sending RST [kudos to Bob for investigation]. This is a case of a server, that finished sending its data and just called close(). 
The socket is in TCP_FIN_WAIT2 and has RCV_SHUTDOWN (set by __tcp_close()) tcp_v4_do_rcv()/tcp_v6_do_rcv() tcp_rcv_state_process() /* LINUX_MIB_TCPABORTONDATA */ tcp_reset() tcp_done_with_error() tcp_done() inet_csk_destroy_sock() /* Destroys AO/MD5 keys */ /* tcp_rcv_state_process() returns SKB_DROP_REASON_TCP_ABORT_ON_DATA */ tcp_v4_send_reset() /* Sends an unsigned RST segment */ tcpdump: > 22:53:15.399377 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 33929, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [F.], seq 2185658590, ack 3969644355, win 502, options [nop,nop,md5 valid], length 0 > 22:53:15.399396 00:00:01:01:00:00 > 00:00:b2:1f:00:00, ethertype IPv4 (0x0800), length 86: (tos 0x0, ttl 64, id 51951, offset 0, flags [DF], proto TCP (6), length 72) > 1.0.0.2.49848 > 1.0.0.1.34567: Flags [.], seq 3969644375, ack 2185658591, win 128, options [nop,nop,md5 valid,nop,nop,sack 1 {2185658590:2185658591}], length 0 > 22:53:16.429588 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 60: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 40) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658590, win 0, length 0 > 22:53:16.664725 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 > 22:53:17.289832 00:00:b2:1f:00:00 > 00:00:01:01:00:00, ethertype IPv4 (0x0800), length 74: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto TCP (6), length 60) > 1.0.0.1.34567 > 1.0.0.2.49848: Flags [R], seq 2185658591, win 0, options [nop,nop,md5 valid], length 0 Note the signed RSTs later in the dump - those are sent by the server when the fin-wait socket gets removed from hash buckets, by the listener socket. Instead of destroying AO/MD5 info and their keys in inet_csk_destroy_sock(), slightly delay it until the actual socket .sk_destruct(). As a shutdown'ed socket can still send non-data replies, they should be signed so that the peer can process them. Now it also matches how AO/MD5 gets destructed for TIME-WAIT sockets (in tcp_twsk_destructor()). This seems optimal for TCP-MD5, while for TCP-AO it seems to have an open problem: once the RST is sent and the socket is actually destructed, there is no information on the initial sequence numbers. So, in case this last RST gets lost in the network, the server's listener socket won't be able to properly sign another RST. Nothing in RFC 1122 prescribes keeping any local state after a non-graceful reset. Luckily, BGP peers are known to use keepalives. While the issue is quite minor/cosmetic, these days monitoring network counters is a common practice and getting invalid signed segments from a trusted BGP peer can get customers worried.
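The resulting teardown order, as wired up in the diff below for IPv4 (the IPv6 variant is symmetric):

    static void tcp4_destruct_sock(struct sock *sk)
    {
            tcp_md5_destruct_sock(sk);      /* MD5 keys freed here now */
            tcp_ao_destroy_sock(sk, false); /* likewise the TCP-AO state */
            inet_sock_destruct(sk);         /* then the generic destructor */
    }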
Investigated-by: Bob Gilligan Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20250909-b4-tcp-ao-md5-rst-finwait2-v5-1-9ffaaaf8b236@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 4 ++++ net/ipv4/tcp.c | 27 +++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 33 ++++++++------------------------- net/ipv6/tcp_ipv6.c | 8 ++++++++ 4 files changed, 47 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0fb7923b8367..277914c4d067 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1941,6 +1941,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) +void tcp_md5_destruct_sock(struct sock *sk); #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, @@ -1957,6 +1958,9 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) NULL +static inline void tcp_md5_destruct_sock(struct sock *sk) +{ +} #endif int tcp_md5_alloc_sigpool(void); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9c576dc9a1f7..7c6c143017ef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -412,6 +412,33 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) return rate64; } +#ifdef CONFIG_TCP_MD5SIG +static void tcp_md5sig_info_free_rcu(struct rcu_head *head) +{ + struct tcp_md5sig_info *md5sig; + + md5sig = container_of(head, struct tcp_md5sig_info, rcu); + kfree(md5sig); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); +} + +void tcp_md5_destruct_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->md5sig_info) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, 1); + tcp_clear_md5_list(sk); + rcu_assign_pointer(tp->md5sig_info, NULL); + call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); + } +} +EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); +#endif + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1e58a8a9ff7a..17176a5d8638 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2494,6 +2494,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp4_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2509,23 +2516,12 @@ static int tcp_v4_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; + sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL @@ -2562,19 +2558,6 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ skb_rbtree_purge(&tp->out_of_order_queue); -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list, if any */ - if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - tcp_clear_md5_list(sk); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); - rcu_assign_pointer(tp->md5sig_info, NULL); - } -#endif - tcp_ao_destroy_sock(sk, false); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0562e939b2e3..08dabc47a6e7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2110,6 +2110,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp6_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet6_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2125,6 +2132,7 @@ static int tcp_v6_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; + sk->sk_destruct = tcp6_destruct_sock; #endif return 0; -- cgit v1.2.3 From 51e547e8c89c661f6fbede4a28b1d33b13625683 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 9 Sep 2025 02:18:51 +0100 Subject: tcp: Free TCP-AO/TCP-MD5 info/keys without RCU Now that the destruction of info/keys is delayed until the socket destructor, it's safe to use kfree() without an RCU callback. The socket is in TCP_CLOSE state either because it never left it, or it's already closed and the refcounter is zero. Either way, no one can discover it anymore, so it's safe to release the memory straight away. Something similar was already possible for twsk.
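[Editorial note] The pattern of the change, condensed from the diff below: since the socket is unhashed and in TCP_CLOSE, no RCU reader can still find the MD5 info, so the RCU grace period can be skipped:

    /* before: a reader might still walk the list under rcu_read_lock() */
    call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);

    /* after: nothing can look the socket up anymore; free synchronously */
    kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));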
Reviewed-by: Kuniyuki Iwashima Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20250909-b4-tcp-ao-md5-rst-finwait2-v5-2-9ffaaaf8b236@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 1 - net/ipv4/tcp.c | 17 +++-------------- net/ipv4/tcp_ao.c | 5 ++--- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 19 +++++-------------- 5 files changed, 12 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index df655ce6987d..1e9e27d6e06b 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -130,7 +130,6 @@ struct tcp_ao_info { u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ - struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7c6c143017ef..7f9c671b1ee0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,27 +413,16 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) } #ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} - void tcp_md5_destruct_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); tcp_clear_md5_list(sk); - rcu_assign_pointer(tp->md5sig_info, NULL); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); + kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); } } EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..31302be78bc4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -static void tcp_ao_info_free_rcu(struct rcu_head *head) +static void tcp_ao_info_free(struct tcp_ao_info *ao) { - struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; @@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk) if (!twsk) tcp_ao_sk_omem_free(sk, ao); - call_rcu(&ao->rcu, tcp_ao_info_free_rcu); + tcp_ao_info_free(ao); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 17176a5d8638..2a0602035729 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1503,9 +1503,9 @@ void tcp_clear_md5_list(struct sock *sk) md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { - hlist_del_rcu(&key->node); + hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); - kfree_rcu(key, rcu); + kfree(key); } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d1c9e4088646..7c2ae07d8d5d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -377,26 +377,17 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } EXPORT_SYMBOL(tcp_time_wait); -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5_twsk_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_key *key; - - key = container_of(head, struct tcp_md5sig_key, rcu); - kfree(key); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - void tcp_twsk_destructor(struct sock *sk) { 
#ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); + if (twsk->tw_md5_key) { + kfree(twsk->tw_md5_key); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); + } } #endif tcp_ao_destroy_sock(sk, true); -- cgit v1.2.3 From dc2f650f7e6857bf384069c1a56b2937a1ee370d Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Wed, 10 Sep 2025 12:50:26 -0700 Subject: udp_tunnel: use netdev_warn() instead of netdev_WARN() netdev_WARN() uses WARN/WARN_ON to print a backtrace along with file and line information. In this case, udp_tunnel_nic_register() returning an error is just a failed operation, not a kernel bug. udp_tunnel_nic_register() can fail due to a memory allocation failure (kzalloc() or udp_tunnel_nic_alloc()). This is a normal runtime error and not a kernel bug. Replace netdev_WARN() with netdev_warn() accordingly. Signed-off-by: Alok Tiwari Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250910195031.3784748-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp_tunnel_nic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index ff66db48453c..944b3cf25468 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -930,7 +930,7 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused, err = udp_tunnel_nic_register(dev); if (err) - netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ -- cgit v1.2.3 From 201825fb4278accb6ace42915566c22391a0900d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 11 Sep 2025 15:43:15 +0100 Subject: net: ethtool: handle EOPNOTSUPP from ethtool get_ts_info() method Network drivers sometimes return -EOPNOTSUPP from their get_ts_info() method, and this should not cause the reporting of PHY timestamping information to be prohibited. Handle this error code, and also arrange for ethtool_net_get_ts_info_by_phc() to return -EOPNOTSUPP when the method is not implemented. This allows e.g. PHYs connected to DSA switches which support timestamping to report their timestamping capabilities. 
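[Editorial note] The resulting fallback, condensed from the diff below: both "no PTP device on the netdev" and "method not implemented or not supported" now let the lookup continue to the PHYs:

    err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc);
    if (err == -ENODEV || err == -EOPNOTSUPP) {
            /* the netdev cannot provide this PHC; try an attached PHY */
            phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
            /* ... */
    }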
Fixes: b9e3f7dc9ed9 ("net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology") Signed-off-by: Russell King (Oracle) Reviewed-by: Kory Maincent Link: https://patch.msgid.link/E1uwiW3-00000004jRF-3CnC@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..92e6a681c797 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -905,7 +905,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, int err; if (!ops->get_ts_info) - return -ENODEV; + return -EOPNOTSUPP; /* Does ptp comes from netdev */ ethtool_init_tsinfo(info); @@ -973,7 +973,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) { + if (err == -ENODEV || err == -EOPNOTSUPP) { struct phy_device *phy; phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); -- cgit v1.2.3 From fdae0ab67d57d480dc61e9fb45678bbdc3786711 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2025 12:19:42 +0000 Subject: net: use NUMA drop counters for softnet_data.dropped Hosts under DoS attack can suffer from false sharing in enqueue_to_backlog(): atomic_inc(&sd->dropped). This is because sd->dropped can be touched from many cpus, possibly residing on different NUMA nodes. Generalize the sk_drop_counters infrastructure added in commit c51613fa276f ("net: add sk->sk_drop_counters") and use it to replace softnet_data.dropped with the NUMA-friendly softnet_data.drop_counters. This adds 64 bytes per cpu, maybe more in the future if we increase the number of counters (currently 2) per 'struct numa_drop_counters'.
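[Editorial note] The idea, condensed from the helpers in the diff below: spread the drop count over two cacheline-aligned atomics indexed by NUMA node parity, so CPUs on different nodes stop bouncing a single cache line:

    struct numa_drop_counters {
            atomic_t drops0 ____cacheline_aligned_in_smp;
            atomic_t drops1 ____cacheline_aligned_in_smp;
    };

    static inline void numa_drop_add(struct numa_drop_counters *ndc, int val)
    {
            /* even-numbered NUMA nodes hit drops0, odd ones drops1 */
            if (numa_node_id() % 2)
                    atomic_add(val, &ndc->drops1);
            else
                    atomic_add(val, &ndc->drops0);
    }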
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250909121942.1202585-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 2 +- include/linux/netdevice.h | 28 +++++++++++++++++++++++++++- include/linux/udp.h | 2 +- include/net/raw.h | 2 +- include/net/sock.h | 37 ++++++++++++------------------------- net/core/dev.c | 2 +- net/core/net-procfs.c | 3 ++- 7 files changed, 45 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 261d02efb615..f43314517396 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f3a3b761abfb..f5a840c07cf1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3459,6 +3459,32 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } +struct numa_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + +static inline int numa_drop_read(const struct numa_drop_counters *ndc) +{ + return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); +} + +static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) +{ + int n = numa_node_id() % 2; + + if (n) + atomic_add(val, &ndc->drops1); + else + atomic_add(val, &ndc->drops0); +} + +static inline void numa_drop_reset(struct numa_drop_counters *ndc) +{ + atomic_set(&ndc->drops0, 0); + atomic_set(&ndc->drops1, 0); +} + /* * Incoming packets are placed on per-CPU queues */ @@ -3504,7 +3530,7 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; - atomic_t dropped ____cacheline_aligned_in_smp; + struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; diff --git a/include/linux/udp.h b/include/linux/udp.h index 981506be1e15..6ed008ab1665 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. 
*/ struct hlist_node tunnel_list; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/raw.h b/include/net/raw.h index d52709139060..66c0ffeada2e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,7 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/include/net/sock.h b/include/net/sock.h index 896bec2d2176..0fd465935334 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,11 +102,6 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; -struct socket_drop_counters { - atomic_t drops0 ____cacheline_aligned_in_smp; - atomic_t drops1 ____cacheline_aligned_in_smp; -}; - /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -287,7 +282,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter - * @sk_drop_counters: optional pointer to socket_drop_counters + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -456,7 +451,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct socket_drop_counters *sk_drop_counters; + struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2698,18 +2693,12 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - int n = numa_node_id() % 2; - - if (n) - atomic_add(segs, &sdc->drops1); - else - atomic_add(segs, &sdc->drops0); - } else { + if (ndc) + numa_drop_add(ndc, segs); + else atomic_add(segs, &sk->sk_drops); - } } static inline void sk_drops_inc(struct sock *sk) @@ -2719,23 +2708,21 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { - const struct socket_drop_counters *sdc = sk->sk_drop_counters; + const struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { + if (ndc) { DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); - return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + return numa_drop_read(ndc); } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - atomic_set(&sdc->drops0, 0); - atomic_set(&sdc->drops1, 0); - } + if (ndc) + numa_drop_reset(ndc); atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/dev.c b/net/core/dev.c index 1d1650d9ecff..2522d9d8f0e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5248,7 +5248,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: - atomic_inc(&sd->dropped); + numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 4f0f0709a1cb..70e0e9a3b650 100644 
--- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ -- cgit v1.2.3 From 010fe36ad2a398adc06c54983466b2f08658d748 Mon Sep 17 00:00:00 2001 From: Mahanta Jambigi Date: Wed, 10 Sep 2025 08:31:25 +0200 Subject: net/smc: Remove unused argument from 2 SMC functions The smc argument is not used in either smc_connect_ism_vlan_setup() or smc_connect_ism_vlan_cleanup(). Hence, remove it. Signed-off-by: Mahanta Jambigi Reviewed-by: Sidraya Jayagond Reviewed-by: Dust Li Link: https://patch.msgid.link/20250910063125.2112577-1-mjambigi@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e0e48f24cd61..a7187e5873ec 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1097,8 +1097,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ -static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_setup(struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; @@ -1113,7 +1112,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || - smc_connect_ism_vlan_setup(smc, ini)) + smc_connect_ism_vlan_setup(ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ @@ -1158,8 +1157,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ -static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_cleanup(struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; @@ -1582,13 +1580,13 @@ static int __smc_connect(struct smc_sock *smc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); fallback: kfree(ini); -- cgit v1.2.3 From 2e7bba08923ebc675b1f0e0e0959e68e53047838 Mon Sep 17 00:00:00 2001 From: Anderson Nascimento Date: Thu, 11 Sep 2025 20:07:44 -0300 Subject: net/tcp: Fix a NULL pointer dereference when using TCP-AO with TCP_REPAIR A NULL pointer dereference can occur in tcp_ao_finish_connect() during a connect() system call on a socket with a TCP-AO key added and TCP_REPAIR enabled. The function is called with skb being NULL and attempts to dereference it via tcp_hdr(skb)->seq without prior validation of skb. Fix this by checking if skb is NULL before dereferencing it. The comment is taken from bpf_skops_established(), which is also called in the same flow.
Unlike the function being patched, bpf_skops_established() validates the skb before dereferencing it.

int main(void){
	struct sockaddr_in sockaddr;
	struct tcp_ao_add tcp_ao;
	int sk;
	int one = 1;

	memset(&sockaddr,'\0',sizeof(sockaddr));
	memset(&tcp_ao,'\0',sizeof(tcp_ao));

	sk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	sockaddr.sin_family = AF_INET;

	memcpy(tcp_ao.alg_name,"cmac(aes128)",12);
	memcpy(tcp_ao.key,"ABCDEFGHABCDEFGH",16);
	tcp_ao.keylen = 16;

	memcpy(&tcp_ao.addr,&sockaddr,sizeof(sockaddr));

	setsockopt(sk, IPPROTO_TCP, TCP_AO_ADD_KEY, &tcp_ao, sizeof(tcp_ao));
	setsockopt(sk, IPPROTO_TCP, TCP_REPAIR, &one, sizeof(one));

	sockaddr.sin_family = AF_INET;
	sockaddr.sin_port = htobe16(123);
	inet_aton("127.0.0.1", &sockaddr.sin_addr);

	connect(sk,(struct sockaddr *)&sockaddr,sizeof(sockaddr));

	return 0;
}

$ gcc tcp-ao-nullptr.c -o tcp-ao-nullptr -Wall
$ unshare -Urn

BUG: kernel NULL pointer dereference, address: 00000000000000b6
PGD 1f648d067 P4D 1f648d067 PUD 1982e8067 PMD 0
Oops: Oops: 0000 [#1] SMP NOPTI
Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020
RIP: 0010:tcp_ao_finish_connect (net/ipv4/tcp_ao.c:1182)

Fixes: 7c2ffaf21bd6 ("net/tcp: Calculate TCP-AO traffic keys") Signed-off-by: Anderson Nascimento Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911230743.2551-3-anderson@allelesecurity.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ao.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..3338b6cc85c4 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -1178,7 +1178,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) if (!ao) return; - WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) -- cgit v1.2.3 From 64863f4ca4945bdb62ce2b30823f39ea9fe95415 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Sep 2025 23:58:16 +0100 Subject: rxrpc: Fix unhandled errors in rxgk_verify_packet_integrity() rxgk_verify_packet_integrity() may get more errors than just -EPROTO from rxgk_verify_mic_skb(). Pretty much anything other than -ENOMEM constitutes an unrecoverable error. In the case of -ENOMEM, we can just drop the packet and wait for a retransmission. The same happens with rxgk_decrypt_skb() and its callers. Fix rxgk_decrypt_skb() and rxgk_verify_mic_skb() to return a greater variety of abort codes and fix their callers to abort the connection on any error apart from -ENOMEM. Also preclear the variables used to hold the abort code returned from rxgk_decrypt_skb() or rxgk_verify_mic_skb() to eliminate uninitialised variable warnings.
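[Editorial note] The dispatch pattern of the fix, condensed from the diff below: only -ENOMEM is treated as recoverable; any other error aborts the connection with the abort code the verify/decrypt helper picked:

    ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata,
                              skb, &offset, &len, &ac);
    if (ret < 0) {
            if (ret != -ENOMEM)     /* OOM: drop and await retransmission */
                    rxrpc_abort_eproto(call, skb, ac,
                                       rxgk_abort_1_verify_mic_eproto);
    }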
Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009739.html Closes: https://lists.infradead.org/pipermail/linux-afs/2025-April/009740.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2038804.1757631496@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk.c | 18 ++++++++++-------- net/rxrpc/rxgk_app.c | 10 ++++++---- net/rxrpc/rxgk_common.h | 14 ++++++++++++-- 3 files changed, 28 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 1e19c605bcc8..dce5a3d8a964 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct krb5_buffer metadata; unsigned int offset = sp->offset, len = sp->len; size_t data_offset = 0, data_len = len; - u32 ac; + u32 ac = 0; int ret = -ENOMEM; _enter(""); @@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, skb, &offset, &len, &ac); kfree(hdr); - if (ret == -EPROTO) { - rxrpc_abort_eproto(call, skb, ac, - rxgk_abort_1_verify_mic_eproto); + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); } else { sp->offset = offset; sp->len = len; @@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct rxgk_header hdr; unsigned int offset = sp->offset, len = sp->len; int ret; - u32 ac; + u32 ac = 0; _enter(""); ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); - if (ret == -EPROTO) - rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); - if (ret < 0) + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; + } if (len < sizeof(hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index b94b77a1c317..df684b5a8531 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -187,7 +187,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, struct key *server_key; unsigned int ticket_offset, ticket_len; u32 kvno, enctype; - int ret, ec; + int ret, ec = 0; struct { __be32 kvno; @@ -236,9 +236,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, &ticket_offset, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; - if (ret < 0) - return rxrpc_abort_conn(conn, skb, ec, ret, - rxgk_abort_resp_tok_dec); + if (ret < 0) { + if (ret != -ENOMEM) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, ticket_len, _key); diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 7370a5655985..80164d89e19c 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch. */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. 
*/ default: + *_error_code = RXGK_INCONSISTENCY; break; } @@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } -- cgit v1.2.3 From 2429a197648178cd4dc930a9d87c13c547460564 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Sep 2025 00:06:17 +0100 Subject: rxrpc: Fix untrusted unsigned subtract Fix the following Smatch static checker warning: net/rxrpc/rxgk_app.c:65 rxgk_yfs_decode_ticket() warn: untrusted unsigned subtract. 'ticket_len - 10 * 4' by prechecking the length of what we're trying to extract in two places in the token and ticket decoding for a response packet. Also use sizeof() on the struct we're extracting rather than specifying the size numerically, to be consistent with the other related statements. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2025-September/010135.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/2039268.1757631977@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/rxgk_app.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index df684b5a8531..30275cb5ba3e 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, _enter(""); + if (ticket_len < 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + /* Get the session key length */ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); if (ret < 0) @@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 token_len; } container; + if (token_len < sizeof(container)) + goto short_packet; + /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. */ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + goto short_packet; kvno = ntohl(container.kvno); enctype = ntohl(container.enctype); ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - 3 * 4) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + goto short_packet; _debug("KVNO %u", kvno); _debug("ENC %u", enctype); @@ -285,4 +290,8 @@ temporary_error: * also come out this way if the ticket decryption fails.
*/ return ret; + +short_packet: + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); } -- cgit v1.2.3 From 91d8a53db2199eefc73ecf3682e0665ea6895696 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 10 Sep 2025 17:22:13 +0200 Subject: xfrm: fix offloading of cross-family tunnels Xiumei reported a regression in IPsec offload tests over xfrmi, where the traffic for IPv6 over IPv4 tunnels is processed in SW instead of going through crypto offload, after commit cc18f482e8b6 ("xfrm: provide common xdo_dev_offload_ok callback implementation"). Commit cc18f482e8b6 added a generic version of existing checks attempting to prevent packets with IPv4 options or IPv6 extension headers from being sent to HW that doesn't support offloading such packets. The check mistakenly uses x->props.family (the outer family) to determine the inner packet's family and verify if options/extensions are present. In the case of IPv6 over IPv4, the check compares some of the traffic class bits to the expected no-options ihl value (5). The original check was introduced in commit 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path"), and then duplicated in the other drivers. Before commit cc18f482e8b6, the loose check (ihl > 5) passed because those traffic class bits were not set to a value that triggered the no-offload codepath. Packets with options/extension headers that should have been handled in SW went through the offload path, and were likely dropped by the NIC or incorrectly processed. Since commit cc18f482e8b6, the check is now strict (ihl != 5), and in a basic setup (no traffic class configured), all packets go through the no-offload codepath. The commits that introduced the incorrect family checks in each driver are: 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path") 8362ea16f69f ("crypto: chcr - ESN for Inline IPSec Tx") 859a497fe80c ("nfp: implement xfrm callbacks and expose ipsec offload feature to upper layer") 32188be805d0 ("cn10k-ipsec: Allow ipsec crypto offload for skb with SA") [ixgbe/ixgbevf commits are ignored, as that HW does not support tunnel mode, thus no cross-family setups are possible] Fixes: cc18f482e8b6 ("xfrm: provide common xdo_dev_offload_ok callback implementation") Reported-by: Xiumei Mu Signed-off-by: Sabrina Dubroca Reviewed-by: Leon Romanovsky Reviewed-by: Zhu Yanjun Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index c7a1f080d2de..44b9de6e4e77 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -438,7 +438,7 @@ ok: check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; - switch (x->props.family) { + switch (x->inner_mode.family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) -- cgit v1.2.3 From 449144f4d5f284c038c57bd7ea54c0d4b7ca766d Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 11 Sep 2025 13:06:29 +0200 Subject: tcp: reorganize SYN ECN code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for AccECN, which needs access here to the IP ECN field value that is only available after INET_ECN_xmit(). No functional changes.
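[Editorial note] Why the ordering matters, condensed from the diff below; the AccECN hook marked here is a hypothetical placeholder, not part of this patch: INET_ECN_xmit() sets ECT(0) in the socket's IP ECN bits, so anything that wants to read them must run after it:

    if (use_ecn) {
            if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
                    INET_ECN_xmit(sk);  /* sets ECT(0) in inet_sk(sk)->tos */
            /* a future AccECN hook could inspect the ECN field here */
            TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
            tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
    }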
Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e180364b8dda..54b8faa3ad95 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -350,10 +350,11 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (use_ecn) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); } } -- cgit v1.2.3 From c3426ba2ed6942fe33c75bf17fc7513ba2c6ac64 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:31 +0200 Subject: tcp: reorganize tcp_sock_write_txrx group for variables later Use the first 3-byte hole at the beginning of the tcp_sock_write_txrx group for 'nonagle'/'rate_app_limited', so that the hole they leave behind can be filled in by later patches. Therefore, the group size of tcp_sock_write_txrx is reduced from 92 + 4 to 91 + 4. In addition, the group size of tcp_sock_write_rx is changed to 96 to fit the pahole outcome. Below are the trimmed pahole outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_sock {
	[...]
	__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */
	/* XXX 3 bytes hole, try to pack */
	[...]
	struct tcp_options_received rx_opt; /* 2588 24 */
	u8 nonagle:4; /* 2612: 0 1 */
	u8 rate_app_limited:1; /* 2612: 4 1 */
	/* XXX 3 bits hole, try to pack */
	__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2613 0 */
	/* XXX 3 bytes hole, try to pack */
	__cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */
	[...]
	__cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */
	[...]
	/* size: 3200, cachelines: 50, members: 161 */
}

[AFTER THIS PATCH]
struct tcp_sock {
	[...]
	__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */
	u8 nonagle:4; /* 2521: 0 1 */
	u8 rate_app_limited:1; /* 2521: 4 1 */
	/* XXX 3 bits hole, try to pack */
	/* XXX 2 bytes hole, try to pack */
	[...]
	struct tcp_options_received rx_opt; /* 2588 24 */
	__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */
	/* XXX 4 bytes hole, try to pack */
	__cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */
	[...]
	__cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */
	[...]
	/* size: 3200, cachelines: 50, members: 161 */
}

Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 57e478bfaef2..d103cc0e7a35 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -285,6 +285,8 @@ struct tcp_sock { * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited?
*/ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -303,8 +305,6 @@ struct tcp_sock { * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; - u8 nonagle : 4,/* Disable Nagle algorithm? */ - rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7f9c671b1ee0..1f643faa8b93 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5145,7 +5145,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); @@ -5162,7 +5162,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); } void __init tcp_init(void) -- cgit v1.2.3 From 30f5ca00624397d81c99515bdd43286ade93d7c8 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:32 +0200 Subject: tcp: ecn functions in separated include file The following patches will modify the ECN helpers and add AccECN helpers; this patch moves the existing ones into a separate include file. No functional changes. Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-5-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ecn.h | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 45 +------------------- net/ipv4/tcp_output.c | 56 +----------------------- 3 files changed, 118 insertions(+), 99 deletions(-) create mode 100644 include/net/tcp_ecn.h (limited to 'net') diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h new file mode 100644 index 000000000000..b3430557676b --- /dev/null +++ b/include/net/tcp_ecn.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_ECN_H +#define _TCP_ECN_H + +#include +#include + +#include +#include +#include +#include + +static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) +{ + if (tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_accept_cwr(struct sock *sk, + const struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) { + tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately.
+ */ + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } +} + +static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) + return true; + return false; +} + +/* Packet ECN state for a SYN-ACK */ +static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; + if (tcp_ecn_disabled(tp)) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk) || + tcp_bpf_ca_needs_ecn(sk)) + INET_ECN_xmit(sk); +} + +/* Packet ECN state for a SYN. */ +static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } + + tp->ecn_flags = 0; + + if (use_ecn) { + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } +} + +static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); +} + +static inline void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) +{ + if (inet_rsk(req)->ecn_ok) + th->ece = 1; +} + +#endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f1be65af1a77..b2793e749cfd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -339,31 +340,6 @@ static bool tcp_in_quickack_mode(struct sock *sk) (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -static void tcp_ecn_queue_cwr(struct tcp_sock *tp) -{ - if (tcp_ecn_mode_rfc3168(tp)) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; -} - -static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - - /* If the sender is telling us it has entered CWR, then its - * cwnd may be very low (even just 1 packet), so we should ACK - * immediately. 
- */ - if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } -} - -static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) -{ - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; -} - static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -399,25 +375,6 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } -static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} - -static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} - -static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) -{ - if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) - return true; - return false; -} - static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 54b8faa3ad95..be8ceefa5332 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include +#include #include #include @@ -319,61 +320,6 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -/* Packet ECN state for a SYN-ACK */ -static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (tcp_ecn_disabled(tp)) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; - else if (tcp_ca_needs_ecn(sk) || - tcp_bpf_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); -} - -/* Packet ECN state for a SYN. */ -static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; - - if (!use_ecn) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) - use_ecn = true; - } - - tp->ecn_flags = 0; - - if (use_ecn) { - if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) - INET_ECN_xmit(sk); - - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); - } -} - -static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) -{ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) - /* tp->ecn_flags are cleared at a later point in time when - * SYN ACK is ultimatively being received. - */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); -} - -static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) -{ - if (inet_rsk(req)->ecn_ok) - th->ece = 1; -} - /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ -- cgit v1.2.3 From 4351ca3fcb3ffecf12631b4996bf085a2dad0db6 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Thu, 11 Sep 2025 15:33:34 +0200 Subject: rds: ib: Increment i_fastreg_wrs before bailing out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to increment i_fastreg_wrs before we bail out from rds_ib_post_reg_frmr(). We have a fixed budget for how many FRWR operations can be outstanding on the dedicated QP used for memory registrations and de-registrations.
This budget is enforced by the atomic_t i_fastreg_wrs. If we bail out early in rds_ib_post_reg_frmr(), we will "leak" the possibility of posting an FRWR operation, and if that accumulates, no FRWR operation can be carried out. Fixes: 1659185fb4d0 ("RDS: IB: Support Fastreg MR (FRMR) memory registration mode") Fixes: 3a2886cca703 ("net/rds: Keep track of and wait for FRWR segments in use upon shutdown") Cc: stable@vger.kernel.org Signed-off-by: Håkon Bugge Reviewed-by: Allison Henderson Link: https://patch.msgid.link/20250911133336.451212-1-haakon.bugge@oracle.com Signed-off-by: Jakub Kicinski --- net/rds/ib_frmr.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 28c1b0022178..bd861191157b 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_dma_len)) - return ret < 0 ? ret : -EINVAL; + if (unlikely(ret != ibmr->sg_dma_len)) { + ret = ret < 0 ? ret : -EINVAL; + goto out_inc; + } - if (cmpxchg(&frmr->fr_state, - FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) - return -EBUSY; + if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) { + ret = -EBUSY; + goto out_inc; + } atomic_inc(&ibmr->ic->i_fastreg_inuse_count); @@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) /* Failure here can be because of -ENOMEM as well */ rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); - atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); - goto out; + goto out_inc; } /* Wait for the registration to complete in order to prevent an invalid @@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ wait_event(frmr->fr_reg_done, !frmr->fr_reg); -out: + return ret; +out_inc: + atomic_inc(&ibmr->ic->i_fastreg_wrs); return ret; } -- cgit v1.2.3 From f755be0b1ff429a2ecf709beeb1bcd7abc111c2b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:25:50 +0200 Subject: mptcp: propagate shutdown to subflows when possible When the MPTCP DATA FIN has been ACKed, there is no more MPTCP-related metadata to exchange, and all subflows can be safely shut down. Before this patch, the subflows were actually terminated at 'close()' time. That's certainly fine most of the time, but not when userspace 'shutdown()'s a connection without close()ing it. When doing so, the subflows were staying in LAST_ACK state on one side -- and consequently in FIN_WAIT2 on the other side -- until the 'close()' of the MPTCP socket. Now, when the DATA FIN has been ACKed, all subflows are shut down. A consequence of this is that the TCP 'FIN' flag can be set earlier now, but the end result is the same. This affects the packetdrill tests looking at the end of the MPTCP connections, but for a good reason. Note that tcp_shutdown() will check the subflow state, so no need to do that again before calling it.
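[Editorial note] To make the trigger concrete, a hedged userspace sketch (editor's illustration, not from the patch):

    /* MPTCP socket, connection established... */
    shutdown(fd, SHUT_RDWR);    /* DATA_FIN exchanged and ACKed */
    /* before the fix, the TCP subflows now sat in LAST_ACK (peer side:
     * FIN_WAIT2) because their FINs were only sent at close() time */
    sleep(3600);                /* application keeps the fd open */
    close(fd);                  /* only here were the subflows terminated */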
Fixes: 3721b9b64676 ("mptcp: Track received DATA_FIN sequence number and add related helpers") Cc: stable@vger.kernel.org Fixes: 16a9a9da1723 ("mptcp: Add helper to process acks of DATA_FIN") Reviewed-by: Mat Martineau Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-fix-sft-connect-v1-1-d40e77cbbf02@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e6fd97b21e9e..5e497a83e967 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -371,6 +371,20 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +static void mptcp_shutdown_subflows(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + tcp_shutdown(ssk, SEND_SHUTDOWN); + unlock_sock_fast(ssk, slow); + } +} + /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { @@ -395,6 +409,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) break; case TCP_CLOSING: case TCP_LAST_ACK: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } @@ -563,6 +578,7 @@ static bool mptcp_check_data_fin(struct sock *sk) mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: -- cgit v1.2.3 From 96939cec994070aa5df852c10fad5fc303a97ea3 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:20 +0200 Subject: mptcp: set remote_deny_join_id0 on SYN recv When a SYN containing the 'C' flag (deny join id0) was received, this piece of information was not propagated to the path-manager. Even though this flag is mainly set on the server side, a client can also tell the server it cannot try to establish new subflows to the client's initial IP address and port. The server's PM should then record such info when received, and before sending events about the new connection. Fixes: df377be38725 ("mptcp: add deny_join_id0 in mptcp_options_received") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-1-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..f31a3a79531a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -883,6 +883,10 @@ create_child: ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress -- cgit v1.2.3 From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:21 +0200 Subject: mptcp: pm: nl: announce deny-join-id0 flag During the connection establishment, a peer can tell the other one that it cannot establish new subflows to the initial IP address and port by setting the 'C' flag [1]. Doing so makes sense when the sender is behind a strict NAT, operating behind a legacy Layer 4 load balancer, or using an anycast IP address, for example.
When this 'C' flag is set, the path-managers must then not try to establish new subflows to the other peer's initial IP address and port. The in-kernel PM has access to this info, but the userspace PM didn't. RFC 8684 [1] is strict about that: (...) therefore the receiver MUST NOT try to open any additional subflows toward this address and port. So it is important to tell userspace about it, as userspace is responsible for respecting this flag. When a new connection is created and established, the Netlink events now contain the existing but not currently used 'flags' attribute. When MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the initial IP address and port -- info that is also part of the event -- can be established. Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1] Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reported-by: Marek Majkowski Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp.h | 2 ++ include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d15335684ec3..d1b4829b580a 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893..5fd5b4cf75ca 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636c..7359d34da446 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport].
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..ce7d42d3bd00 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } -- cgit v1.2.3 From 92da495cb65719583aa06bc946aeb18a10e1e6e2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:23 +0200 Subject: mptcp: tfo: record 'deny join id0' info When TFO was in use, the check to see if the 'C' flag (deny join id0) was set was bypassed. This flag can be set on a TFO connection too, so the check should also be done in that case. Note that the set_fully_established label is also used when a 4th ACK is received. In this case, deny_join_id0 will not be set. Fixes: dfc8d0603033 ("mptcp: implement delayed seq generation for passive fastopen") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-4-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2a8ea28442b2..1103b3341a70 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -985,13 +985,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } - if (mp_opt->deny_join_id0) - WRITE_ONCE(msk->pm.remote_deny_join_id0, true); - if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); -- cgit v1.2.3 From f3b52167a0cb23b27414452fbc1278da2ee884fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 12 Sep 2025 09:17:03 -0700 Subject: page_pool: always add GFP_NOWARN for ATOMIC allocations Driver authors often forget to add GFP_NOWARN for page allocation from the datapath. This is annoying to users as OOMs are a fact of life, and we pretty much expect network Rx to hit page allocation failures during OOM. Make page pool add GFP_NOWARN for ATOMIC allocations by default. Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250912161703.361272-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ba70569bd4b0..36a98f2bcac3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -555,6 +555,12 @@ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool netmem_ref netmem; int i, nr_pages; + /* Unconditionally set NOWARN if allocating from NAPI. + * Drivers forget to set it, and OOM reports on packet Rx are useless.
+ */ + if ((gfp & GFP_ATOMIC) == GFP_ATOMIC) + gfp |= __GFP_NOWARN; + /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); -- cgit v1.2.3 From 3f9a22be374b864c9199a43971d0eec18a88cde4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 18:36:49 +0200 Subject: mptcp: pm: netlink: fix if-idx type As pointed out by Donald, when parsing an entry, the wrong type was set for the temp value: this value is signed. There are no real issues here, because the intermediate variable was only wrong for the sign, not for the size, and the final variable had the right sign. But this feels wrong, and is confusing, so fix this small typo introduced by commit ef0da3b8a2f1 ("mptcp: move address attribute into mptcp_addr_info"). Reported-by: Donald Hunter Closes: https://lore.kernel.org/m2plc0ui9z.fsf@gmail.com Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-next-mptcp-minor-fixes-6-18-v1-3-99d179b483ad@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..2225b1c5b966 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -113,7 +113,7 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { - u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } -- cgit v1.2.3 From d2d3f529e7b6ff2aa432b16a2317126621c28058 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 12 Sep 2025 16:03:32 +0200 Subject: ethernet: Extend device_get_mac_address() to use NVMEM A lot of modern SoCs have the ability to store MAC addresses in their NVMEM. So extend the generic function device_get_mac_address() to obtain the MAC address from an nvmem cell named 'mac-address' in case there is no firmware node which contains the MAC address directly. Signed-off-by: Stefan Wahren Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250912140332.35395-3-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- net/ethernet/eth.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 4e3651101b86..43e211e611b1 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -613,7 +613,10 @@ EXPORT_SYMBOL(fwnode_get_mac_address); */ int device_get_mac_address(struct device *dev, char *addr) { - return fwnode_get_mac_address(dev_fwnode(dev), addr); + if (!fwnode_get_mac_address(dev_fwnode(dev), addr)) + return 0; + + return nvmem_get_mac_address(dev, addr); } EXPORT_SYMBOL(device_get_mac_address); -- cgit v1.2.3 From b6f56a44e4c1014b08859dcf04ed246500e310e5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 13 Sep 2025 13:35:15 +0200 Subject: net: rfkill: gpio: Fix crash due to dereferencing uninitialized pointer Since commit 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") rfkill_find_type() gets called with the possibly uninitialized "const char *type_name;" local variable.
On x86 systems when rfkill-gpio binds to a "BCM4752" or "LNV4752" acpi_device, the rfkill->type is set based on the ACPI acpi_device_id: rfkill->type = (unsigned)id->driver_data; and there is no "type" property so device_property_read_string() will fail and leave type_name uninitialized, leading to a potential crash. rfkill_find_type() does accept a NULL pointer, so fix the potential crash by initializing type_name to NULL. Note that this has likely not been caught so far because: 1. Not many x86 machines actually have a "BCM4752"/"LNV4752" acpi_device 2. The stack happened to contain NULL where type_name is stored Fixes: 7d5e9737efda ("net: rfkill: gpio: get the name and type from device property") Cc: stable@vger.kernel.org Cc: Heikki Krogerus Signed-off-by: Hans de Goede Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20250913113515.21698-1-hansg@kernel.org Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 41e657e97761..cf2dcec6ce5a 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = { static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; - struct gpio_desc *gpio; + const char *type_name = NULL; const char *name_property; const char *type_property; - const char *type_name; + struct gpio_desc *gpio; int ret; if (dmi_check_system(rfkill_gpio_deny_table)) -- cgit v1.2.3 From 6fabca2fc94d33cdf7ec102058983b086293395f Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 17 Sep 2025 10:08:00 +0200 Subject: bpf: Explicitly check accesses to bpf_sock_addr Syzkaller found a kernel warning on the following sock_addr program: 0: r0 = 0 1: r2 = *(u32 *)(r1 +60) 2: exit which triggers: verifier bug: error during ctx access conversion (0) This is happening because offset 60 in bpf_sock_addr corresponds to an implicit padding of 4 bytes, right after msg_src_ip6. Access to this padding isn't rejected in sock_addr_is_valid_access and it thus later fails to convert the access. This patch fixes it by explicitly checking the various fields of bpf_sock_addr in sock_addr_is_valid_access. I checked the other ctx structures and is_valid_access functions and didn't find any other similar cases. Other cases of (properly handled) padding are covered in new tests in a subsequent patch.
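For orientation only (a rough layout sketch, not part of the patch): assuming the usual 8-byte alignment of the trailing sk pointer, the tail of the UAPI struct lays out as below, which is why offset 60 maps to no field:

	__u32 msg_src_ip4;			/* offset 40 */
	__u32 msg_src_ip6[4];			/* offsets 44..59 */
	/* 4 bytes of implicit padding, offsets 60..63: no field here */
	__bpf_md_ptr(struct bpf_sock *, sk);	/* offset 64, 8-byte aligned */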
Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Reported-by: syzbot+136ca59d411f92e821b7@syzkaller.appspotmail.com Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=136ca59d411f92e821b7 Link: https://lore.kernel.org/bpf/b58609d9490649e76e584b0361da0abd3c2c1779.1758094761.git.paul.chaignon@gmail.com --- net/core/filter.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..8342f810ad85 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9284,13 +9284,17 @@ static bool sock_addr_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET; break; - default: - if (type == BPF_READ) { - if (size != size_default) - return false; - } else { + case bpf_ctx_range(struct bpf_sock_addr, user_family): + case bpf_ctx_range(struct bpf_sock_addr, family): + case bpf_ctx_range(struct bpf_sock_addr, type): + case bpf_ctx_range(struct bpf_sock_addr, protocol): + if (type != BPF_READ) return false; - } + if (size != size_default) + return false; + break; + default: + return false; } return true; -- cgit v1.2.3 From 45c8a6cc2bcd780e634a6ba8e46bffbdf1fc5c01 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 15 Sep 2025 17:56:46 +0000 Subject: tcp: Clear tcp_sk(sk)->fastopen_rsk in tcp_disconnect(). syzbot reported the splat below where a socket had tcp_sk(sk)->fastopen_rsk in the TCP_ESTABLISHED state. [0] syzbot reused the server-side TCP Fast Open socket as a new client before the TFO socket completes 3WHS: 1. accept() 2. connect(AF_UNSPEC) 3. connect() to another destination As of accept(), sk->sk_state is TCP_SYN_RECV, and tcp_disconnect() changes it to TCP_CLOSE and makes connect() possible, which restarts timers. Since tcp_disconnect() forgot to clear tcp_sk(sk)->fastopen_rsk, the retransmit timer triggered the warning and the intended packet was not retransmitted. Let's call reqsk_fastopen_remove() in tcp_disconnect(). 
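A minimal userspace sketch of the triggering sequence (illustrative only; assumes a TFO-enabled listener lfd, dst2 is a placeholder second destination, and error handling is omitted):

	int c = accept(lfd, NULL, NULL);	/* TFO child, 3WHS not yet completed */
	struct sockaddr sa = { .sa_family = AF_UNSPEC };
	connect(c, &sa, sizeof(sa));		/* tcp_disconnect(): TCP_SYN_RECV -> TCP_CLOSE */
	connect(c, (struct sockaddr *)&dst2, sizeof(dst2));	/* timers restart with the stale fastopen_rsk */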
[0]: WARNING: CPU: 2 PID: 0 at net/ipv4/tcp_timer.c:542 tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Modules linked in: CPU: 2 UID: 0 PID: 0 Comm: swapper/2 Not tainted 6.17.0-rc5-g201825fb4278 #62 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:tcp_retransmit_timer (net/ipv4/tcp_timer.c:542 (discriminator 7)) Code: 41 55 41 54 55 53 48 8b af b8 08 00 00 48 89 fb 48 85 ed 0f 84 55 01 00 00 0f b6 47 12 3c 03 74 0c 0f b6 47 12 3c 04 74 04 90 <0f> 0b 90 48 8b 85 c0 00 00 00 48 89 ef 48 8b 40 30 e8 6a 4f 06 3e RSP: 0018:ffffc900002f8d40 EFLAGS: 00010293 RAX: 0000000000000002 RBX: ffff888106911400 RCX: 0000000000000017 RDX: 0000000002517619 RSI: ffffffff83764080 RDI: ffff888106911400 RBP: ffff888106d5c000 R08: 0000000000000001 R09: ffffc900002f8de8 R10: 00000000000000c2 R11: ffffc900002f8ff8 R12: ffff888106911540 R13: ffff888106911480 R14: ffff888106911840 R15: ffffc900002f8de0 FS: 0000000000000000(0000) GS:ffff88907b768000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8044d69d90 CR3: 0000000002c30003 CR4: 0000000000370ef0 Call Trace: tcp_write_timer (net/ipv4/tcp_timer.c:738) call_timer_fn (kernel/time/timer.c:1747) __run_timers (kernel/time/timer.c:1799 kernel/time/timer.c:2372) timer_expire_remote (kernel/time/timer.c:2385 kernel/time/timer.c:2376 kernel/time/timer.c:2135) tmigr_handle_remote_up (kernel/time/timer_migration.c:944 kernel/time/timer_migration.c:1035) __walk_groups.isra.0 (kernel/time/timer_migration.c:533 (discriminator 1)) tmigr_handle_remote (kernel/time/timer_migration.c:1096) handle_softirqs (./arch/x86/include/asm/jump_label.h:36 ./include/trace/events/irq.h:142 kernel/softirq.c:580) irq_exit_rcu (kernel/softirq.c:614 kernel/softirq.c:453 kernel/softirq.c:680 kernel/softirq.c:696) sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1050 (discriminator 35) arch/x86/kernel/apic/apic.c:1050 (discriminator 35)) Fixes: 8336886f786f ("tcp: TCP Fast Open Server - support TFO listeners") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250915175800.118793-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc55..ad76556800f2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3327,6 +3327,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) @@ -3442,6 +3443,10 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; -- cgit v1.2.3 From 3d3466878afd8d43ec0ca2facfbc7f03e40d0f79 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:19 +0000 Subject: smc: Fix use-after-free in __pnet_find_base_ndev(). syzbot reported use-after-free of net_device in __pnet_find_base_ndev(), which was called during connect(). [0] smc_pnet_find_ism_resource() fetches sk_dst_get(sk)->dev and passes down to pnet_find_base_ndev(), where RTNL is held. Then, UAF happened at __pnet_find_base_ndev() when the dev is first used. 
This means dev had already been freed before acquiring RTNL in pnet_find_base_ndev(). While dev is going away, dst->dev could be swapped with blackhole_netdev, and the dev's refcnt by dst will be released. We must hold dev's refcnt before calling smc_pnet_find_ism_resource(). Also, smc_pnet_find_roce_resource() has the same problem. Let's use __sk_dst_get() and dst_dev_rcu() in the two functions. [0]: BUG: KASAN: use-after-free in __pnet_find_base_ndev+0x1b1/0x1c0 net/smc/smc_pnet.c:926 Read of size 1 at addr ffff888036bac33a by task syz.0.3632/18609 CPU: 1 UID: 0 PID: 18609 Comm: syz.0.3632 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/18/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 __pnet_find_base_ndev+0x1b1/0x1c0 net/smc/smc_pnet.c:926 pnet_find_base_ndev net/smc/smc_pnet.c:946 [inline] smc_pnet_find_ism_by_pnetid net/smc/smc_pnet.c:1103 [inline] smc_pnet_find_ism_resource+0xef/0x390 net/smc/smc_pnet.c:1154 smc_find_ism_device net/smc/af_smc.c:1030 [inline] smc_find_proposal_devices net/smc/af_smc.c:1115 [inline] __smc_connect+0x372/0x1890 net/smc/af_smc.c:1545 smc_connect+0x877/0xd90 net/smc/af_smc.c:1715 __sys_connect_file net/socket.c:2086 [inline] __sys_connect+0x313/0x440 net/socket.c:2105 __do_sys_connect net/socket.c:2111 [inline] __se_sys_connect net/socket.c:2108 [inline] __x64_sys_connect+0x7a/0x90 net/socket.c:2108 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f47cbf8eba9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f47ccdb1038 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 00007f47cc1d5fa0 RCX: 00007f47cbf8eba9 RDX: 0000000000000010 RSI: 0000200000000280 RDI: 000000000000000b RBP: 00007f47cc011e19 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f47cc1d6038 R14: 00007f47cc1d5fa0 R15: 00007ffc512f8aa8 The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff888036bacd00 pfn:0x36bac flags: 0xfff00000000000(node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000000 ffffea0001243d08 ffff8880b863fdc0 0000000000000000 raw: ffff888036bacd00 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as freed page last allocated via order 2, migratetype Unmovable, gfp_mask 0x446dc0(GFP_KERNEL_ACCOUNT|__GFP_ZERO|__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_COMP), pid 16741, tgid 16741 (syz-executor), ts 343313197788, free_ts 380670750466 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x240/0x2a0 mm/page_alloc.c:1851 prep_new_page mm/page_alloc.c:1859 [inline] get_page_from_freelist+0x21e4/0x22c0 mm/page_alloc.c:3858 __alloc_frozen_pages_noprof+0x181/0x370 mm/page_alloc.c:5148 alloc_pages_mpol+0x232/0x4a0 mm/mempolicy.c:2416 ___kmalloc_large_node+0x5f/0x1b0 mm/slub.c:4317 __kmalloc_large_node_noprof+0x18/0x90 mm/slub.c:4348 __do_kmalloc_node mm/slub.c:4364 [inline] __kvmalloc_node_noprof+0x6d/0x5f0 mm/slub.c:5067 
alloc_netdev_mqs+0xa3/0x11b0 net/core/dev.c:11812 tun_set_iff+0x532/0xef0 drivers/net/tun.c:2775 __tun_chr_ioctl+0x788/0x1df0 drivers/net/tun.c:3085 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:598 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f page last free pid 18610 tgid 18608 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1395 [inline] __free_frozen_pages+0xbc4/0xd30 mm/page_alloc.c:2895 free_large_kmalloc+0x13a/0x1f0 mm/slub.c:4820 device_release+0x99/0x1c0 drivers/base/core.c:-1 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 netdev_run_todo+0xd2e/0xea0 net/core/dev.c:11513 rtnl_unlock net/core/rtnetlink.c:157 [inline] rtnl_net_unlock include/linux/rtnetlink.h:135 [inline] rtnl_dellink+0x537/0x710 net/core/rtnetlink.c:3563 rtnetlink_rcv_msg+0x7cc/0xb70 net/core/rtnetlink.c:6946 netlink_rcv_skb+0x208/0x470 net/netlink/af_netlink.c:2552 netlink_unicast_kernel net/netlink/af_netlink.c:1320 [inline] netlink_unicast+0x82f/0x9e0 net/netlink/af_netlink.c:1346 netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1896 sock_sendmsg_nosec net/socket.c:714 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:729 ____sys_sendmsg+0x505/0x830 net/socket.c:2614 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2668 __sys_sendmsg net/socket.c:2700 [inline] __do_sys_sendmsg net/socket.c:2705 [inline] __se_sys_sendmsg net/socket.c:2703 [inline] __x64_sys_sendmsg+0x19b/0x260 net/socket.c:2703 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Memory state around the buggy address: ffff888036bac200: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888036bac280: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff888036bac300: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff888036bac380: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888036bac400: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 0afff91c6f5e ("net/smc: add pnetid support") Fixes: 1619f770589a ("net/smc: add pnetid support for SMC-D and ISM") Reported-by: syzbot+ea28e9d85be2f327b6c6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68c237c7.050a0220.3c6139.0036.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916214758.650211-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/smc/smc_pnet.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index b90337f86e83..7225b5fa17a6 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -1126,37 +1126,38 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); - - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; + struct net_device *dev; + struct dst_entry *dst; - smc_pnet_find_roce_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? 
dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_roce_by_pnetid(dev, ini); + dev_put(dev); + } } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; ini->ism_dev[0] = NULL; - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_ism_by_pnetid(dev, ini); + dev_put(dev); + } } /* Lookup and apply a pnet table entry to the given ib device. -- cgit v1.2.3 From 935d783e5de9b64587f3adb25641dd8385e64ddb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:20 +0000 Subject: smc: Use __sk_dst_get() and dst_dev_rcu() in in smc_clc_prfx_set(). smc_clc_prfx_set() is called during connect() and not under RCU nor RTNL. Using sk_dst_get(sk)->dev could trigger UAF. Let's use __sk_dst_get() and dev_dst_rcu() under rcu_read_lock() after kernel_getsockname(). Note that the returned value of smc_clc_prfx_set() is not used in the caller. While at it, we change the 1st arg of smc_clc_prfx_set[46]_rcu() not to touch dst there. Fixes: a046d57da19f ("smc: CLC handshake (incl. preparation steps)") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916214758.650211-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/smc/smc_clc.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 08be56dfb3f2..976b2102bdfc 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -509,10 +509,10 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) } /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ -static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, +static int smc_clc_prfx_set4_rcu(struct net_device *dev, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { - struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) @@ -530,12 +530,12 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, } /* fill CLC proposal msg with ipv6 prefixes from device */ -static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, +static int smc_clc_prfx_set6_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { #if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_dev *in6_dev = __in6_dev_get(dev); struct inet6_ifaddr *ifa; int cnt = 0; @@ -564,41 +564,44 @@ static int smc_clc_prfx_set(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; struct sockaddr_in6 *addr6; struct sockaddr_in *addr; + struct net_device *dev; + struct dst_entry *dst; int rc = -ENOENT; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { - rc = -ENODEV; - goto out_rel; - } /* get address to which the internal TCP socket is bound */ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) - goto out_rel; + goto 
out; + /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { + rc = -ENODEV; + goto out_unlock; + } + if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; - rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + rc = smc_clc_prfx_set4_rcu(dev, addr->sin_addr.s_addr, prop); } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { /* mapped IPv4 address - peer is IPv4 only */ - rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + rc = smc_clc_prfx_set4_rcu(dev, addr6->sin6_addr.s6_addr32[3], prop); } else { /* IPv6 */ - rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + rc = smc_clc_prfx_set6_rcu(dev, prop, ipv6_prfx); } + +out_unlock: rcu_read_unlock(); -out_rel: - dst_release(dst); out: return rc; } -- cgit v1.2.3 From 235f81045c008169cc4e1955b4a64e118eebe61b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:21 +0000 Subject: smc: Use __sk_dst_get() and dst_dev_rcu() in smc_clc_prfx_match(). smc_clc_prfx_match() is called from smc_listen_work() and not under RCU nor RTNL. Using sk_dst_get(sk)->dev could trigger UAF. Let's use __sk_dst_get() and dst_dev_rcu(). Note that the returned value of smc_clc_prfx_match() is not used in the caller. Fixes: a046d57da19f ("smc: CLC handshake (incl. preparation steps)") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916214758.650211-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/smc/smc_clc.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 976b2102bdfc..09745baa1017 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -657,26 +657,26 @@ static int smc_clc_prfx_match6_rcu(struct net_device *dev, int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct net_device *dev; + struct dst_entry *dst; int rc; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { rc = -ENODEV; - goto out_rel; + goto out; } - rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) - rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + rc = smc_clc_prfx_match4_rcu(dev, prop); else - rc = smc_clc_prfx_match6_rcu(dst->dev, prop); - rcu_read_unlock(); -out_rel: - dst_release(dst); + rc = smc_clc_prfx_match6_rcu(dev, prop); out: + rcu_read_unlock(); + return rc; } -- cgit v1.2.3 From 0b0e4d51c6554e5ecc3f8cc73c2eaf12da21249a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:22 +0000 Subject: smc: Use __sk_dst_get() and dst_dev_rcu() in smc_vlan_by_tcpsk(). smc_vlan_by_tcpsk() fetches sk_dst_get(sk)->dev before RTNL and passes it to netdev_walk_all_lower_dev(), which is illegal. Also, smc_vlan_by_tcpsk_walk() does not require RTNL at all. Let's use __sk_dst_get(), dst_dev_rcu(), and netdev_walk_all_lower_dev_rcu(). Note that the returned value of smc_vlan_by_tcpsk() is not used in the caller. 
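As in the previous patches of this series, the conversion boils down to the following pattern (a rough sketch, not the exact kernel code; do_something() stands in for the per-caller work):

	rcu_read_lock();
	dst = __sk_dst_get(sk);			/* does not take a dst refcount */
	dev = dst ? dst_dev_rcu(dst) : NULL;	/* only valid inside this RCU section */
	if (dev)
		do_something(dev);		/* or dev_hold(dev) to keep it past unlock */
	rcu_read_unlock();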
Fixes: 0cfdd8f92cac ("smc: connection and link group creation") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916214758.650211-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 262746e304dd..2a559a98541c 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1883,35 +1883,32 @@ static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct netdev_nested_priv priv; struct net_device *ndev; + struct dst_entry *dst; int rc = 0; ini->vlan_id = 0; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + ndev = dst ? dst_dev_rcu(dst) : NULL; + if (!ndev) { rc = -ENODEV; - goto out_rel; + goto out; } - ndev = dst->dev; if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); - goto out_rel; + goto out; } priv.data = (void *)&ini->vlan_id; - rtnl_lock(); - netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); - rtnl_unlock(); - -out_rel: - dst_release(dst); + netdev_walk_all_lower_dev_rcu(ndev, smc_vlan_by_tcpsk_walk, &priv); out: + rcu_read_unlock(); + return rc; } -- cgit v1.2.3 From c65f27b9c3be2269918e1cbad6d8884741f835c5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:23 +0000 Subject: tls: Use __sk_dst_get() and dst_dev_rcu() in get_netdev_for_sock(). get_netdev_for_sock() is called during setsockopt(), so not under RCU. Using sk_dst_get(sk)->dev could trigger UAF. Let's use __sk_dst_get() and dst_dev_rcu(). Note that the only ->ndo_sk_get_lower_dev() user is bond_sk_get_lower_dev(), which uses RCU. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250916214758.650211-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f672a62a9a52..a82fdcf19969 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -123,17 +123,19 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { - struct dst_entry *dst = sk_dst_get(sk); - struct net_device *netdev = NULL; + struct net_device *dev, *lowest_dev = NULL; + struct dst_entry *dst; - if (likely(dst)) { - netdev = netdev_sk_get_lowest_dev(dst->dev, sk); - dev_hold(netdev); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (likely(dev)) { + lowest_dev = netdev_sk_get_lowest_dev(dev, sk); + dev_hold(lowest_dev); } + rcu_read_unlock(); - dst_release(dst); - - return netdev; + return lowest_dev; } static void destroy_record(struct tls_record_info *record) -- cgit v1.2.3 From 108a86c71c93ff28087994e6107bc99ebe336629 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:24 +0000 Subject: mptcp: Call dst_release() in mptcp_active_enable(). 
mptcp_active_enable() calls sk_dst_get(), which returns dst with its refcount bumped, but forgot dst_release(). Let's add missing dst_release(). Cc: stable@vger.kernel.org Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916214758.650211-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index fed40dae5583..c0e516872b4b 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -505,6 +505,8 @@ void mptcp_active_enable(struct sock *sk) if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)) atomic_set(&pernet->active_disable_times, 0); + + dst_release(dst); } } -- cgit v1.2.3 From 893c49a78d9f85e4b8081b908fb7c407d018106a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 16 Sep 2025 21:47:25 +0000 Subject: mptcp: Use __sk_dst_get() and dst_dev_rcu() in mptcp_active_enable(). mptcp_active_enable() is called from subflow_finish_connect(), which is icsk->icsk_af_ops->sk_rx_dst_set() and it's not always under RCU. Using sk_dst_get(sk)->dev could trigger UAF. Let's use __sk_dst_get() and dst_dev_rcu(). Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916214758.650211-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index c0e516872b4b..e8ffa62ec183 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -501,12 +501,15 @@ void mptcp_active_enable(struct sock *sk) struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk)); if (atomic_read(&pernet->active_disable_times)) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; - if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)) + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (dev && (dev->flags & IFF_LOOPBACK)) atomic_set(&pernet->active_disable_times, 0); - - dst_release(dst); + rcu_read_unlock(); } } -- cgit v1.2.3 From 6bdcb735fec6cb866b0d40634d4f23effba81074 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:43 +0300 Subject: devlink: Add a 'num_doorbells' driverinit param This parameter can be used by drivers to configure a different number of doorbells. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Reviewed-by: Jiri Pirko Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 3 +++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index c51da4fba7e7..0a9c20d70122 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -148,3 +148,6 @@ own name. - The max number of Virtual Functions (VFs) exposed by the PF. after reboot/pci reset, 'sriov_totalvfs' entry under the device's sysfs directory will report this value. + * - ``num_doorbells`` + - u32 + - Controls the number of doorbells used by the device. 
diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d4362f010e4..9e824f61e40f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -531,6 +531,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -598,6 +599,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" #define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 33134940c266..70e69523412c 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -107,6 +107,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, + .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 542a495cbaa6dc57a310da62b501fdf318657cad Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:25 +0200 Subject: tcp: AccECN core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements Accurate ECN without negotiation and AccECN Option (that will be added by later changes). Based on AccECN specifications: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN allows feeding back the number of CE (congestion experienced) marks accurately to the sender, in contrast to RFC3168 ECN that can only signal one marks-seen-yes/no per RTT. Congestion control algorithms can take advantage of the accurate ECN information to fine-tune their congestion response to avoid drastic rate reduction when only mild congestion is encountered. With Accurate ECN, tp->received_ce (r.cep in AccECN spec) keeps track of how many segments have arrived with a CE mark. Accurate ECN uses the ACE field (ECE, CWR, AE) to communicate the value back to the sender, which updates tp->delivered_ce (s.cep) based on the feedback. This signalling channel is lossy when ACE field overflow occurs. A conservative strategy is selected here to deal with the ACE overflow; however, some strategies using the AccECN option later in the overall patchset mitigate falsely detected overflows. The ACE field values on the wire are offset by TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the real CE marks rather than forcing all downstream users to adapt to the wire offset. This patch uses the first 1-byte hole and the last 4-byte hole of the tcp_sock_write_txrx group for 'received_ce_pending' and 'received_ce'. Also, the group size of tcp_sock_write_txrx is increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member. Below are the trimmed pahole outcomes before and after this patch. [BEFORE THIS PATCH] struct tcp_sock { [...]
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 app_limited; /* 2580 4 */ u32 rcv_wnd; /* 2584 4 */ struct tcp_options_received rx_opt; /* 2588 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 164 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 2 + include/linux/tcp.h | 3 + include/net/tcp.h | 15 ++++ include/net/tcp_ecn.h | 53 ++++++++++- net/ipv4/tcp.c | 5 +- net/ipv4/tcp_input.c | 100 +++++++++++++++++++-- net/ipv4/tcp_output.c | 9 +- 7 files changed, 175 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 7bbda5944ee2..31313a9adccc 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -101,6 +101,8 @@ u32 prr_delivered u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx) u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) +u32 received_ce read_mostly read_write +u8:4 received_ce_pending read_mostly read_write u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d103cc0e7a35..90cee6e53527 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -287,6 +287,8 @@ struct tcp_sock { */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -299,6 +301,7 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl.
rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index e25340459ce4..bc5159fe842e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -973,6 +973,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. + */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 + /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ @@ -1782,11 +1790,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + u32 ace; + /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | ntohl(TCP_FLAG_ACK) | snd_wnd); } diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b3430557676b..b0ed89dbad41 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -12,6 +12,7 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { + /* Do not set CWR if in AccECN mode! */ if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -19,8 +20,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) static inline void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK @@ -36,6 +39,52 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. 
+ */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + } +} + +static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +{ + u32 wire_ace; + + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; +} + static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1f643faa8b93..16456c10e5e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include #include #include +#include #include #include #include @@ -3406,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5138,6 +5140,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5145,7 +5148,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b2793e749cfd..98782134c2f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,16 +360,25 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. 
+ */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } @@ -385,10 +394,64 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int flag) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -3744,7 +3807,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3752,8 +3816,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3774,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? 
*/ u32 prior_fack; sack_state.first_sackt = 0; @@ -3881,6 +3950,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3905,7 +3979,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3914,12 +3989,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3940,7 +4019,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -6071,6 +6150,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb); + /* We know that such packets are checksummed * on entry. */ @@ -6119,6 +6200,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6159,6 +6241,8 @@ validate: return; step5: + tcp_ecn_received_counters(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be8ceefa5332..a3a6d3e91d84 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -328,7 +328,14 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + INET_ECN_xmit(sk); + tcp_accecn_set_ace(th, tp); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { -- cgit v1.2.3 From 3cae34274c79e0c60ccd1c10516973af1aed2a7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:26 +0200 Subject: tcp: accecn: AccECN negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN negotiation parts based on the specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN is negotiated using ECE, CWR and AE flags in the TCP header. 
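For orientation, a rough map in terms of the flag masks this series touches (a sketch based on the draft, not a quote from the patch):

	/* RFC3168 ECN request: SYN + ECE + CWR (TCPHDR_SYN_ECN) */
	/* AccECN request:      SYN + AE + CWR + ECE (all TCPHDR_ACE bits set) */
	/* AccECN SYN/ACK base: SYN + ACK + CWR (TCPHDR_SYNACK_ACCECN); the ACE
	 * bits additionally encode the IP ECN field seen on the SYN */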
TCP falls back to using RFC3168 ECN if one of the ends supports only RFC3168-style ECN. The AccECN negotiation includes reflecting the IP ECN field value seen in SYN and SYNACK back using the same bits as negotiation to allow responding to SYN CE marks and to detect ECN field mangling. CE marks should not occur currently because SYN=1 segments are sent with Non-ECT in the IP ECN field (but a proposal exists to remove this restriction). Reflecting the SYN IP ECN field in the SYNACK is relatively simple. Reflecting the SYNACK IP ECN field in the final/third ACK of the handshake is more challenging. Linux TCP code is not well prepared for using the final/third ACK as a signalling channel, which makes things somewhat complicated here. The tcp_ecn sysctl can be used to select the highest ECN variant (Accurate ECN, ECN, No ECN) that is attempted to be negotiated and requested for incoming connections and outgoing connections: TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN, TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN, TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN. After this patch, the size of tcp_request_sock remains unchanged and no new holes are added. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ /* size: 360, cachelines: 6, members: 16 */ } [AFTER THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ bool accecn_ok; /* 357 1 */ u8 syn_ect_snt:2; /* 358: 0 1 */ u8 syn_ect_rcv:2; /* 358: 2 1 */ u8 accecn_fail_mode:4; /* 358: 4 1 */ /* size: 360, cachelines: 6, members: 20 */ } After this patch, the size of tcp_sock remains unchanged and no new holes are added. Also, 4 bits of the existing 2-byte hole are exploited. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 unused:5; /* 2761: 3 1 */ u8 thin_lto:1; /* 2762: 0 1 */ u8 fastopen_connect:1; /* 2762: 1 1 */ u8 fastopen_no_cookie:1; /* 2762: 2 1 */ u8 fastopen_client_fail:2; /* 2762: 3 1 */ u8 frto:1; /* 2762: 5 1 */ /* XXX 2 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ /* XXX 2 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 164 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 syn_ect_snt:2; /* 2761: 3 1 */ u8 syn_ect_rcv:2; /* 2761: 5 1 */ u8 thin_lto:1; /* 2761: 7 1 */ u8 fastopen_connect:1; /* 2762: 0 1 */ u8 fastopen_no_cookie:1; /* 2762: 1 1 */ u8 fastopen_client_fail:2; /* 2762: 2 1 */ u8 frto:1; /* 2762: 4 1 */ /* XXX 3 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ u8 accecn_fail_mode:4; /* 2766: 0 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ [...]
/* size: 3200, cachelines: 50, members: 166 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 36 ++- .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 8 +- include/net/tcp.h | 1 + include/net/tcp_ecn.h | 310 +++++++++++++++++++-- net/ipv4/syncookies.c | 4 + net/ipv4/sysctl_net_ipv4.c | 3 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 50 +++- net/ipv4/tcp_ipv4.c | 6 +- net/ipv4/tcp_minisocks.c | 24 +- net/ipv4/tcp_output.c | 10 +- net/ipv6/syncookies.c | 2 + net/ipv6/tcp_ipv6.c | 1 + 14 files changed, 400 insertions(+), 59 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9f5891c9b07b..3d8eb54b84f9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -443,20 +443,28 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - - Possible values are: - - = ===================================================== - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - = ===================================================== + ECN is used only when both ends of the TCP connection indicate support + for it. This feature is useful in avoiding losses due to congestion by + allowing supporting routers to signal congestion before having to drop + packets. A host that supports ECN both sends ECN at the IP layer and + feeds back ECN at the TCP layer. The highest variant of ECN feedback + that both peers support is chosen by the ECN negotiation (Accurate ECN, + ECN, or no ECN). 
+ + The highest negotiated variant for incoming connection requests + and the highest variant requested by outgoing connection + attempts: + + ===== ==================== ==================== + Value Incoming connections Outgoing connections + ===== ==================== ==================== + 0 No ECN No ECN + 1 ECN ECN + 2 ECN No ECN + 3 AccECN AccECN + 4 AccECN ECN + 5 AccECN No ECN + ===== ==================== ==================== Default: 2 diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 31313a9adccc..4f71ece7c655 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -103,6 +103,9 @@ u32 delivered read_mostly read_w u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u8:2 syn_ect_snt write_mostly read_write +u8:2 syn_ect_rcv read_mostly read_write +u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90cee6e53527..b8432bed546d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -168,6 +168,10 @@ struct tcp_request_sock { * after data-in-SYN. */ u8 syn_tos; + bool accecn_ok; + u8 syn_ect_snt: 2, + syn_ect_rcv: 2, + accecn_fail_mode:4; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -375,7 +379,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + syn_ect_snt:2, /* AccECN ECT memory, only */ + syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ @@ -391,6 +396,7 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ + u8 accecn_fail_mode:4; /* AccECN failure handling */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index bc5159fe842e..da8c6640ead3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -972,6 +972,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) #define TCP_ACCECN_CEP_ACE_MASK 0x7 #define TCP_ACCECN_ACE_MAX_DELTA 6 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b0ed89dbad41..da0b355418bd 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -4,12 +4,26 @@ #include #include +#include #include #include #include #include +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is + * attempted to be negotiated and requested for incoming connection + * and outgoing connection, respectively. + */ +enum tcp_ecn_mode { + TCP_ECN_IN_NOECN_OUT_NOECN = 0, + TCP_ECN_IN_ECN_OUT_ECN = 1, + TCP_ECN_IN_ECN_OUT_NOECN = 2, + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, + TCP_ECN_IN_ACCECN_OUT_ECN = 4, + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode!
*/ @@ -39,19 +53,125 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +/* tp->accecn_fail_mode */ +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3) + +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; +} + +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; +} + +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; +} + +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; +} + +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->accecn_fail_mode |= mode; +} + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; } -static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) { - tp->received_ce = 0; - tp->received_ce_pending = 0; + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ + static const int ace_to_ecn[8] = { + INET_ECN_ECT_0, /* 0b000 (Undefined) */ + INET_ECN_ECT_1, /* 0b001 (Undefined) */ + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ + INET_ECN_ECT_1, /* 0b101 (Reserved) */ + INET_ECN_CE, /* 0b110 (CE is received) */ + INET_ECN_ECT_1 /* 0b111 (Undefined) */ + }; + + return ace_to_ecn[ace & 0x7]; +} + +/* Check ECN field transition to detect invalid transitions */ +static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) +{ + if (rcv == snt) + return true; + + /* Non-ECT altered to something or something became non-ECT */ + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) + return false; + /* CE -> ECT(0/1)? 
*/ + if (snt == INET_ECN_CE) + return false; + return true; +} + +static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, + u8 sent_ect) +{ + u8 ect = tcp_accecn_extract_syn_ect(ace); + struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + return true; + + if (!tcp_ect_transition_valid(sent_ect, ect)) { + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + return false; + } + + return true; +} + +/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ +static inline void tcp_accecn_third_ack(struct sock *sk, + const struct sk_buff *skb, u8 sent_ect) +{ + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); + struct tcp_sock *tp = tcp_sk(sk); + + switch (ace) { + case 0x0: + /* Invalid value */ + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + break; + case 0x7: + case 0x5: + case 0x1: + /* Unused but legal values */ + break; + default: + /* Validation only applies to first non-data packet */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !TCP_SKB_CB(skb)->sacked && + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && + !tp->delivered_ce) + tp->delivered_ce++; + } + break; + } } /* Updates Accurate ECN received counters from the received IP ECN field */ -static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_ecn_received_counters(struct sock *sk, + const struct sk_buff *skb) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -74,27 +194,152 @@ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_bu } } -static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) { - u32 wire_ace; + return tcp_accecn_ace(th) > 0x1; +} + +/* Used to form the ACE flags for SYN/ACK */ +static inline u16 tcp_accecn_reflector_flags(u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE + */ + static const u8 ecn_to_ace_flags[4] = { + 0b010, /* Not-ECT is received */ + 0b011, /* ECT(1) is received */ + 0b100, /* ECT(0) is received */ + 0b110 /* CE is received */ + }; + + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); +} - wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; - th->ece = !!(wire_ace & 0x1); - th->cwr = !!(wire_ace & 0x2); - th->ae = !!(wire_ace & 0x4); +/* AccECN specification, 3.1.2: If a TCP server that implements AccECN + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set + * to any combination other than 000, 011 or 111, it MUST negotiate the + * use of AccECN as if they had been set to 111. 
+ */ +static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) +{ + u8 ace = tcp_accecn_ace(th); + + return ace && ace != 0x3; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; tp->received_ce_pending = 0; } -static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, - const struct tcphdr *th) +/* Used for make_synack to form the ACE flags */ +static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: + * +====================+====================================+ + * | IP-ECN codepoint | Respective ACE flags on SYN/ACK | + * | received on SYN | AE CWR ECE | + * +====================+====================================+ + * | Not-ECT | 0 1 0 | + * | ECT(1) | 0 1 1 | + * | ECT(0) | 1 0 0 | + * | CE | 1 1 0 | + * +====================+====================================+ + */ + th->ae = !!(ect & INET_ECN_ECT_0); + th->cwr = ect != INET_ECN_ECT_0; + th->ece = ect == INET_ECN_ECT_1; +} + +static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, + struct tcphdr *th) +{ + u32 wire_ace; + + /* The final packet of the 3WHS or anything like it must reflect + * the SYN/ACK ECT instead of putting CEP into ACE field, such + * cases show up in tcp_flags. + */ + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; + } +} + +/* See Table 2 of the AccECN draft */ +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, + u8 ip_dsfield) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 ace = tcp_accecn_ace(th); + + switch (ace) { + case 0x0: + case 0x7: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | No ECN | 0 0 0 | Not ECN | + * | AccECN | Broken | 1 1 1 | Not ECN | + * +========+========+============+=============+ + */ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + break; + case 0x1: + case 0x5: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | Nonce | 1 0 1 | (Reserved) | + * | AccECN | ECN | 0 0 1 | Classic ECN | + * | Nonce | AccECN | 0 0 1 | Classic ECN | + * | ECN | AccECN | 0 0 1 | Classic ECN | + * +========+========+============+=============+ + */ + if (tcp_ecn_mode_pending(tp)) + /* Downgrade from AccECN, or requested initially */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + default: + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (INET_ECN_is_ce(ip_dsfield) && + tcp_accecn_validate_syn_feedback(sk, ace, + tp->syn_ect_snt)) { + tp->received_ce++; + tp->received_ce_pending++; + } + break; + } } -static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, - const struct tcphdr *th) +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, + const struct sk_buff *skb) { + if (tcp_ecn_mode_pending(tp)) { + if (!tcp_accecn_syn_requested(th)) { + /* Downgrade to classic ECN feedback */ + tcp_ecn_mode_set(tp,
TCP_ECN_MODE_RFC3168); + } else { + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + } + } if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } @@ -110,7 +355,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, /* Packet ECN state for a SYN-ACK */ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (tcp_ecn_disabled(tp)) @@ -118,6 +363,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); + + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + TCP_SKB_CB(skb)->tcp_flags |= + tcp_accecn_reflector_flags(tp->syn_ect_rcv); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } } /* Packet ECN state for a SYN. */ @@ -125,8 +377,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + bool use_ecn, use_accecn; + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); + + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); @@ -142,23 +399,32 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) INET_ECN_xmit(sk); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (use_accecn) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } } } static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. 
*/ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + } } static inline void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) th->ece = 1; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb0819463fae..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static siphash_aligned_key_t syncookie_secret[2]; @@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); + struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; @@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } ireq = inet_rsk(req); + treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..268f8b86e8a7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 2; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -728,7 +729,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = &tcp_ecn_mode_max, }, { .procname = "tcp_ecn_fallback", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 16456c10e5e8..7261ee6dd875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3407,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98782134c2f4..8449a5a3e368 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3665,8 +3665,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3696,7 +3706,7 @@ static void tcp_send_challenge_ack(struct sock *sk) 
WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3863,7 +3873,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -5907,6 +5917,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -6004,7 +6015,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -6015,6 +6026,8 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) + accecn_reflector = true; if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -6024,7 +6037,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6493,7 +6506,8 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6565,7 +6579,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6624,7 +6638,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6806,7 +6820,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6853,9 +6867,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -7025,6 +7042,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -7032,7 +7058,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && 
ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7050,6 +7077,9 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2a0602035729..6162f8dbe9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1189,7 +1190,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { - const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; @@ -1202,6 +1203,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); @@ -3558,7 +3560,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c2ae07d8d5d..a4b8be6fdcdc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -451,12 +452,23 @@ void tcp_openreq_init_rwin(struct request_sock *req, ireq->rcv_wscale = rcv_wscale; } -static void tcp_ecn_openreq_child(struct tcp_sock *tp, - const struct request_sock *req) +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tcp_ecn_received_counters(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
+ TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -621,7 +633,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); + tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3a6d3e91d84..deb9b085a8a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -332,8 +332,9 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - INET_ECN_xmit(sk); - tcp_accecn_set_ace(th, tp); + if (!tcp_accecn_ace_fail_recv(tp)) + INET_ECN_xmit(sk); + tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ @@ -3356,7 +3357,10 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback */ + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0ee1a909771..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include #include #include +#include #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08dabc47a6e7..5f0a138f4220 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -544,6 +544,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); -- cgit v1.2.3 From 9a011277445583bab002fbf5043fab0ea03dc5dd Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:27 +0200 Subject: tcp: accecn: add AccECN rx byte counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_txrx is increased from 95 + 4 to 107 + 4 and an extra 4-byte hole is created but will be exploited in later patches: [BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] 
/* size: 3200, cachelines: 50, members: 166 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 received_ecn_bytes[3];/* 2584 12 */ u32 app_limited; /* 2596 4 */ u32 rcv_wnd; /* 2600 4 */ struct tcp_options_received rx_opt; /* 2604 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 167 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 4 +++ include/net/tcp_ecn.h | 29 +++++++++++++++++++++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 7 +++--- net/ipv4/tcp_minisocks.c | 2 +- 6 files changed, 40 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 4f71ece7c655..5a2b0af57364 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write +u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b8432bed546d..012d01347b3c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -306,6 +306,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index da0b355418bd..1a41a459aa07 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -171,7 +171,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk, /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, u32 len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -191,9 +191,24 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); + + if (len > 0) + tp->received_ecn_bytes[ecnfield - 1] += len; } } +/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters + * initialized at the start of the half-connection. [...] These byte counters + * reflect only the TCP payload length, excluding TCP header and TCP options. 
+ */ +static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + /* AccECN specification, 5.1: [...] a server can determine that it * negotiated AccECN as [...] if the ACK contains an ACE field with * the value 0b010 to 0b111 (decimal 2 to 7). @@ -232,10 +247,22 @@ static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) return ace && ace != 0x3; } +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); } /* Used for make_synack to form the ACE flags */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7261ee6dd875..a45a4184b603 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5142,6 +5142,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5149,7 +5150,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8449a5a3e368..636a63383412 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6163,7 +6163,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, 0); /* We know that such packets are checksummed * on entry. 
@@ -6213,7 +6213,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6254,7 +6255,7 @@ validate: return; step5: - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a4b8be6fdcdc..1dbcc09ff7a9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,7 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : -- cgit v1.2.3 From a92543d597621736b8e40fd1a2b50a93bd9840f7 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:28 +0200 Subject: tcp: accecn: AccECN needs to know delivered bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN byte counter estimation requires delivered bytes, which can be calculated while processing SACK blocks and the cumulative ACK. The delivered bytes will be used to estimate the byte counters between AccECN options (i.e. on ACKs without the option). Accurate ECN does not depend on SACK to function; however, the calculation would be more accurate if SACK were there. Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-5-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_input.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 636a63383412..8b48a3c00945 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1050,6 +1050,7 @@ struct tcp_sacktag_state { u64 last_sackt; u32 reord; u32 sack_delivered; + u32 delivered_bytes; int flag; unsigned int mss_now; struct rate_sample *rate; @@ -1411,7 +1412,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, + int dup_sack, int pcount, u32 plen, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); @@ -1471,6 +1472,7 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; + state->delivered_bytes += plen; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1507,7 +1509,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, * tcp_highest_sack_seq() when skb is highest_sack.
*/ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount, + start_seq, end_seq, dup_sack, pcount, skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); @@ -1792,6 +1794,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), + skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) @@ -3300,6 +3303,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; + /* snd_una delta covers these skbs */ + sack->delivered_bytes -= skb->len; } else if (tcp_is_sack(tp)) { tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) @@ -3396,6 +3401,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); } + + sack->delivered_bytes = (skb ? + TCP_SKB_CB(skb)->seq : tp->snd_una) - + prior_snd_una; } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb))) { @@ -3858,6 +3867,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; sack_state.sack_delivered = 0; + sack_state.delivered_bytes = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); -- cgit v1.2.3 From 77a4fdf43c5ec81a431770511505d371c8822837 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:29 +0200 Subject: tcp: sack option handling improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1) Don't early return when sack doesn't fit. AccECN code will be placed after this fragment so no early returns please. 2) Make sure opts->num_sack_blocks is not left undefined. E.g., tcp_current_mss() does not memset its opts struct to zero. AccECN code checks if SACK option is present and may even alter it to make room for AccECN option when many SACK blocks are present. Thus, num_sack_blocks needs to be always valid. 
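As a sketch of the sizing rule behind these two points, the helper below recomputes how many SACK blocks fit into what is left of the 40-byte TCP option space. The constants match the kernel's definitions, but the standalone helper itself is hypothetical and only for illustration:

#define MAX_TCP_OPTION_SPACE		40	/* TCP header allows at most 40 option bytes */
#define TCPOLEN_SACK_BASE_ALIGNED	4
#define TCPOLEN_SACK_PERBLOCK		8

/* Number of SACK blocks that fit after 'used' option bytes; returns 0
 * when not even one block fits, so num_sack_blocks is always set.
 */
static unsigned int sack_blocks_that_fit(unsigned int used,
					 unsigned int eff_sacks)
{
	unsigned int remaining = MAX_TCP_OPTION_SPACE - used;
	unsigned int fit;

	if (remaining < TCPOLEN_SACK_BASE_ALIGNED + TCPOLEN_SACK_PERBLOCK)
		return 0;

	fit = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
	return eff_sacks < fit ? eff_sacks : fit;
}

With, say, 24 bytes of options already in use, remaining is 16 and at most (16 - 4) / 8 = 1 block is emitted even if more SACK blocks are pending.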
Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-6-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_output.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index deb9b085a8a2..5be2b3eb73d3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,17 +985,20 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; - if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + - TCPOLEN_SACK_PERBLOCK)) - return size; - - opts->num_sack_blocks = - min_t(unsigned int, eff_sacks, - (remaining - TCPOLEN_SACK_BASE_ALIGNED) / - TCPOLEN_SACK_PERBLOCK); - - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) { + opts->num_sack_blocks = + min_t(unsigned int, eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } else { + opts->num_sack_blocks = 0; + } + } else { + opts->num_sack_blocks = 0; } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, -- cgit v1.2.3 From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:30 +0200 Subject: tcp: accecn: AccECN option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using the AccECN option. This change implements the AccECN option tx & rx side processing, without the option send control features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in later changes rather than in this one). A full-length AccECN option is always attempted, but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends up with an odd size, so the option write code tries to take advantage of a NOP used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to the ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACKs without the AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives; a standalone sketch of this 24-bit correction arithmetic follows below. The heuristic may get too confused when there are enough different byte counter deltas between ACKs carrying the AccECN option, in which case it just gives up on updating the counters for a while. The tcp_ecn_option sysctl can be used to select the option sending mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM, and TCP_ECN_OPTION_FULL. This patch increases the size of the tcp_info struct, as there are no existing holes for new u32 variables.
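As referenced above, the 24-bit correction arithmetic can be shown in isolation. This is a simplified, standalone sketch (function name made up for the example); the patch's actual helper is tcp_update_ecn_bytes() in include/net/tcp_ecn.h further below, which additionally extracts the 24-bit field from the option with a misaligned 32-bit load:

#include <stdint.h>

/* 'wire' is the 24-bit byte counter carried in the AccECN option
 * (with its init offset already removed); 'cnt' is the full 32-bit
 * local counter. The delta is computed modulo 2^24 and sign-extended
 * so that an overestimate made while no option was present is
 * corrected downwards.
 */
static int32_t accecn_bytes_delta(uint32_t *cnt, uint32_t wire)
{
	uint32_t delta = (wire - *cnt) & 0xFFFFFFu;

	if (delta & 0x800000u)		/* sign bit of the 24-bit delta */
		delta |= 0xFF000000u;	/* sign-extend to 32 bits */
	*cnt += delta;			/* unsigned wraparound == signed add */
	return (int32_t)delta;
}

For example, if the local estimate ran ahead to 0x10 while the wire counter says 0x0C, delta becomes (0x0C - 0x10) & 0xFFFFFF = 0xFFFFFC, which sign-extends to -4 and pulls the counter back to 0x0C.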
Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 19 +++ .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 9 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 13 ++ include/net/tcp_ecn.h | 89 ++++++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/sysctl_net_ipv4.c | 9 ++ net/ipv4/tcp.c | 15 +- net/ipv4/tcp_input.c | 94 +++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 165 ++++++++++++++++++++- 12 files changed, 412 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3d8eb54b84f9..1c206501b973 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -468,6 +468,25 @@ tcp_ecn - INTEGER Default: 2 +tcp_ecn_option - INTEGER + Control Accurate ECN (AccECN) option sending when AccECN has been + successfully negotiated during handshake. Send logic inhibits + sending AccECN options regardless of this setting when no AccECN + option has been seen for the reverse direction. + + Possible values are: + + = ============================================================ + 0 Never send AccECN option. This also disables sending AccECN + option in SYN/ACK during handshake. + 1 Send AccECN option sparingly according to the minimum option + rules outlined in draft-ietf-tcpm-accurate-ecn. + 2 Send AccECN option on every packet whenever it fits into TCP + option space.
+ = ============================================================ + + Default: 2 + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 5a2b0af57364..b941151f8c0a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w u32 received_ce read_mostly read_write u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u32[3] delivered_ecn_bytes read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write +u8:2 accecn_minlen write_mostly read_write +u8:2 est_ecnfield read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? 
*/ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54a7d187f62a..acbb7dd497e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index da8c6640ead3..6be29129465e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 1a41a459aa07..08c7f4757e4e 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -24,6 +24,13 @@ enum tcp_ecn_mode { TCP_ECN_IN_ACCECN_OUT_NOECN = 5, }; +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! 
*/ @@ -169,6 +176,79 @@ } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN counters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) @@ -192,8 +272,12 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (len > 0) + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -263,6 +347,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index bdac8c42fa82..53e0e85b52be 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -316,6 +316,13 @@ struct tcp_info { * in milliseconds, including any * unfinished recovery.
*/ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 tcpi_received_ce_bytes; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 268f8b86e8a7..4a697acb4e85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a45a4184b603..8c4a4b8666fc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -4155,6 +4156,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4281,6 +4285,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -5162,12 +5174,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b48a3c00945..e898a76c485e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,73 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } +/* Returns true if the byte 
counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +468,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +480,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -436,12 +507,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -3973,6 +4046,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4012,6 +4086,7 @@ no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. 
*/ @@ -4139,6 +4214,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4230,6 +4306,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4290,11 +4372,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6119,6 +6204,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6162f8dbe9d2..aa8dbfe20924 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3561,6 +3561,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5be2b3eb73d3..34e5c83bbace 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? 
+ synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. 
+ * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int max_combine_saving; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (remaining >= align_size) { + size = align_size; + break; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } + if (tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; -- cgit v1.2.3 From aa55a7dde7ec506bb23448a5005ae3f4f809d022 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:31 +0200 Subject: tcp: accecn: AccECN option send control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. 
The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters is increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force the option to be sent at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables). - AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member: [BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ struct list_head tsorted_sent_queue; /* 2496 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ u64 accecn_opt_tstamp; /* 2496 8 */ struct list_head tsorted_sent_queue; /* 2504 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */ u8 nonagle:4; /* 2529: 0 1 */ u8 rate_app_limited:1; /* 2529: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2530: 0 1 */ u8 unused2:4; /* 2530: 4 1 */ u8 accecn_minlen:2; /* 2531: 0 1 */ u8 est_ecnfield:2; /* 2531: 2 1 */ u8 accecn_opt_demand:2; /* 2531: 4 1 */ u8 prev_ecnfield:2; /* 2531: 6 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */ [...] /* size: 3200, cachelines: 50, members: 173 */ } Signed-off-by: Chia-Yu Chang Co-developed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 6 +++ .../networking/net_cachelines/tcp_sock.rst | 3 ++ include/linux/tcp.h | 4 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 ++ include/net/tcp_ecn.h | 52 ++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 ++++ net/ipv4/tcp.c | 5 ++- net/ipv4/tcp_input.c | 4 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 26 ++++++++--- 12 files changed, 107 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 1c206501b973..a06cb99d66dc 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER Default: 2 +tcp_ecn_option_beacon - INTEGER + Controls the Accurate ECN (AccECN) option sending frequency per RTT; it + takes effect only when tcp_ecn_option is set to 2.
+ + Default: 3 (the AccECN option will be sent at least 3 times per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index b941151f8c0a..d4dc01800945 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -109,6 +109,9 @@ u8:2 syn_ect_snt write_mostly read_w u8:2 syn_ect_rcv read_mostly read_write u8:2 accecn_minlen write_mostly read_write u8:2 est_ecnfield read_write +u8:2 accecn_opt_demand read_mostly read_write +u8:2 prev_ecnfield read_write +u64 accecn_opt_tstamp read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 73557656cb2d..f637b659b35a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -275,6 +275,7 @@ struct tcp_sock { u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -296,7 +297,8 @@ struct tcp_sock { unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ - unused3:4; + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ + prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index acbb7dd497e1..34eb3aecb3f2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { u8 sysctl_tcp_ecn; u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6be29129465e..78dd7b8a4145 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U +/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 08c7f4757e4e..133fb6b79500 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -176,6 +176,17 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Demand the minimum # of ACKs to send the AccECN option */ +static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + /* Maps IP ECN field ECT/CE code point to AccECN option field number, given * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
*/ @@ -256,6 +267,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk, u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge; if (!INET_ECN_is_not_ect(ecnfield)) { u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); @@ -274,9 +286,34 @@ if (len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + u32 bytes_mask = GENMASK_U32(31, 22); + tp->received_ecn_bytes[ecnfield - 1] += len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen); + + /* Send AccECN option at least once per 2^22-byte + * increase in any ECN byte counter. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + bytes_mask) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACKs are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; } } } @@ -349,6 +386,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; tp->est_ecnfield = 0; } @@ -431,6 +469,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + tp->accecn_opt_demand = 2; if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { @@ -451,6 +490,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, } else { tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); } } @@ -542,4 +582,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) th->ece = 1; } +static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); + const struct tcp_sock *tp = tcp_sk(sk); + + if (!ecn_beacon) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= + (tp->srtt_us >> 3); +} + #endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4a697acb4e85..24dbc603cc44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c4a4b8666fc..090f9ac43d4c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; +
tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e898a76c485e..87154fd86167 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6121,8 +6121,10 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tcp_accecn_opt_demand_min(sk, 1); + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aa8dbfe20924..6a63be1f6461 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dbcc09ff7a9..193343494558 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34e5c83bbace..f897c2594954 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } - if (tp) + if (tp) { tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } - if (tcp_ecn_mode_accecn(tp) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { - opts->use_synack_ecn_bytes = 0; - size += tcp_options_fit_accecn(opts, tp->accecn_minlen, - MAX_TCP_OPTION_SPACE - size); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); + + if (ecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); -- cgit v1.2.3 From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:32 +0200 Subject: tcp: accecn: AccECN option failure handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN option may fail in various ways, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to handle blackholes - If no option arrives in SYN/ACK, assume the option is not usable - If an option arrives later, re-enable it - If option is zeroed, disable AccECN option processing This patch uses existing padding bits in tcp_request_sock and holes in tcp_sock without increasing the size.
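To make the zeroing detection concrete, here is a minimal userspace sketch of the classification the patch performs in tcp_accecn_option_init() (shown in the diff below); the constant names mirror the patch, while be24() is a hand-rolled stand-in for the kernel's get_unaligned_be24() helper, so this is an illustrative approximation rather than the kernel code itself:

#include <stdint.h>
#include <stddef.h>

#define ACCECN_OPT_EMPTY_SEEN	0x1
#define ACCECN_OPT_COUNTER_SEEN	0x2
#define ACCECN_OPT_FAIL_SEEN	0x3
#define ACCECN_PERFIELD		3	/* bytes per 24-bit counter field */

/* Read one 24-bit big-endian counter field */
static uint32_t be24(const uint8_t *p)
{
	return ((uint32_t)p[0] << 16) | ((uint32_t)p[1] << 8) | p[2];
}

/* @payload points past the kind/length bytes; @optlen is length - 2.
 * A zeroed EE1B (first) or EE0B (third) field means a middlebox
 * zeroed the option, so AccECN option processing must be disabled.
 */
static int classify_accecn_option(const uint8_t *payload, size_t optlen)
{
	if (optlen < ACCECN_PERFIELD)
		return ACCECN_OPT_EMPTY_SEEN;
	if (be24(payload) == 0)
		return ACCECN_OPT_FAIL_SEEN;
	if (optlen < ACCECN_PERFIELD * 3)
		return ACCECN_OPT_COUNTER_SEEN;
	if (be24(payload + ACCECN_PERFIELD * 2) == 0)
		return ACCECN_OPT_FAIL_SEEN;
	return ACCECN_OPT_COUNTER_SEEN;
}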
Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 +++- include/net/tcp_ecn.h | 51 +++++++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 14 +++++++++++++ net/ipv4/tcp_output.c | 11 ++++++++--- 7 files changed, 111 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f637b659b35a..3ca5ed02de6d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -173,6 +173,7 @@ struct tcp_request_sock { u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; + u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -407,7 +408,8 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u8 accecn_fail_mode:4; /* AccECN failure handling */ + u8 accecn_fail_mode:4, /* AccECN failure handling */ + saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 133fb6b79500..f13e5cd2b1ac 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) tp->accecn_fail_mode |= mode; } +#define TCP_ACCECN_OPT_NOT_SEEN 0x0 +#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 +#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 +#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; @@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, return true; } +static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, + u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +} + /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ static inline void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 sent_ect) @@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, } } +static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, + u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return TCP_ACCECN_OPT_FAIL_SEEN; + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* See Table 2 of the AccECN draft */ -static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 
ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 53e0e85b52be..dce3113787a7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -323,6 +323,8 @@ struct tcp_info { __u32 tcpi_received_e1_bytes; __u32 tcpi_received_e0_bytes; __u32 tcpi_received_ce_bytes; + __u16 tcpi_accecn_fail_mode; + __u16 tcpi_accecn_opt_seen; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090f9ac43d4c..5b5c655ded1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; @@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87154fd86167..5732f2d4329c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr; + if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; @@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6123,7 +6145,13 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6606,7 +6634,8 @@ consume: */ if 
(tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 193343494558..327095ef95ef 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -678,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -855,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f897c2594954..65b90f73daa0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,9 +985,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. 
+ */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); @@ -1076,7 +1081,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1156,7 +1161,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; -- cgit v1.2.3 From fe2cddc648f0d7cdf7377e1cb5a8c7dc5547e290 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:33 +0200 Subject: tcp: accecn: AccECN option ceb/cep and ACE field multi-wrap heuristics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AccECN option ceb/cep heuristic algorithm is from AccECN spec Appendix A.2.2 to mitigate false ACE field overflows. Armed with the ceb delta from the option, delivered bytes, and delivered packets, it is possible to estimate how many times the ACE field wrapped. This calculation is necessary only if more than one wrap is possible. Without SACK, delivered bytes and packets are not always trustworthy, in which case TCP falls back to the simpler no-or-all wraps ceb algorithm.
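A worked example of the estimator, with assumed numbers: if an ACK newly delivers delivered_pkts = 70 and delivered_bytes = 100000 while the option shows the CE byte counter advancing by d_ceb = 50000, then est_d_cep = ceil(50000 * 70 / 100000) = 35 CE-marked packets, of which 35 & ~7 = 32 are attributed to whole 8-packet wraps of the 3-bit ACE field. A compact userspace sketch of this calculation follows; the mask value matches the kernel's TCP_ACCECN_CEP_ACE_MASK, the other names are hypothetical:

#include <stdint.h>

#define ACE_MASK 0x7U	/* the ACE field counts CE packets modulo 8 */

/* Pick a CE-packet delta between the no-wrap reading (delta) and the
 * conservative all-wraps reading (safe_delta), guided by CE-marked bytes.
 */
static uint32_t estimate_cep_delta(uint32_t delta, uint32_t safe_delta,
				   uint32_t d_ceb, uint32_t delivered_pkts,
				   uint64_t delivered_bytes)
{
	uint32_t est_d_cep, candidate;

	if (delivered_bytes <= d_ceb)
		return safe_delta;	/* byte-based estimate unusable */

	/* ceil(d_ceb * delivered_pkts / delivered_bytes) */
	est_d_cep = (uint32_t)(((uint64_t)d_ceb * delivered_pkts +
				delivered_bytes - 1) / delivered_bytes);

	/* Keep the ACE remainder, add the estimated whole wraps */
	candidate = delta + (est_d_cep & ~ACE_MASK);
	return candidate < safe_delta ? candidate : safe_delta;
}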
Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-10-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 78dd7b8a4145..7c51a0a5ace8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -256,6 +256,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ TCPOLEN_ACCECN_PERFIELD * \ TCP_ACCECN_NUMFIELDS) +#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5732f2d4329c..9fdc6ce25eb1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -493,16 +493,19 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int flag) { + u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); - u32 delta, safe_delta; + u32 delta, safe_delta, d_ceb; + bool opt_deltas_valid; u32 corrected_ace; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; - tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + opt_deltas_valid = tcp_accecn_process_option(tp, skb, + delivered_bytes, flag); if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ @@ -525,6 +528,35 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, safe_delta = delivered_pkts - ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + if (opt_deltas_valid) { + d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; + if (!d_ceb) + return delta; + + if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && + (tcp_is_sack(tp) || + ((1 << inet_csk(sk)->icsk_ca_state) & + (TCPF_CA_Open | TCPF_CA_CWR)))) { + u32 est_d_cep; + + if (delivered_bytes <= d_ceb) + return safe_delta; + + est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * + delivered_pkts, + delivered_bytes); + return min(safe_delta, + delta + + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); + } + + if (d_ceb > delta * tp->mss_cache) + return safe_delta; + if (d_ceb < + safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) + return delta; + } + return safe_delta; } -- cgit v1.2.3 From e7e9da850a46b0632b18861525602faa08f3e9e1 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:34 +0200 Subject: tcp: accecn: try to fit AccECN option with SACK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As SACK blocks tend to eat all option space when there are many holes, it is useful to compromise on sending many SACK blocks in every ACK and attempt to fit the AccECN option there by reducing the number of SACK blocks. However, it will never go below two SACK blocks because of the AccECN option. As the AccECN option is often not put to every ACK, the space hijack is usually only temporary. Depending on the required AccECN fields (can be either 3, 2, 1, or 0, cf. Table 5 in AccECN spec) and the NOPs used for alignment of other TCP options, up to two SACK blocks will be reduced.
Please find below tables for more details: +====================+=========================================+ | Number of | Required | Remaining | Number of | Final | | SACK | AccECN | option | reduced | number of | | blocks | fields | spaces | SACK blocks | SACK blocks | +===========+==========+===========+=============+=============+ | x (<=2) | 0 to 3 | any | 0 | x | +-----------+----------+-----------+-------------+-------------+ | 3 | 0 | any | 0 | 3 | | 3 | 1 | <4 | 1 | 2 | | 3 | 1 | >=4 | 0 | 3 | | 3 | 2 | <8 | 1 | 2 | | 3 | 2 | >=8 | 0 | 3 | | 3 | 3 | <12 | 1 | 2 | | 3 | 3 | >=12 | 0 | 3 | +-----------+----------+-----------+-------------+-------------+ | y (>=4) | 0 | any | 0 | y | | y (>=4) | 1 | <4 | 1 | y-1 | | y (>=4) | 1 | >=4 | 0 | y | | y (>=4) | 2 | <8 | 1 | y-1 | | y (>=4) | 2 | >=8 | 0 | y | | y (>=4) | 3 | <4 | 2 | y-2 | | y (>=4) | 3 | <12 | 1 | y-1 | | y (>=4) | 3 | >=12 | 0 | y | +===========+==========+===========+=============+=============+ Signed-off-by: Chia-Yu Chang Co-developed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-11-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_output.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 65b90f73daa0..388c45859469 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -873,7 +873,9 @@ static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, int remaining) { int size = TCP_ACCECN_MAXSIZE; + int sack_blocks_reduce = 0; int max_combine_saving; + int rem = remaining; int align_size; if (opts->use_synack_ecn_bytes) @@ -888,14 +890,31 @@ static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, else align_size = ALIGN_DOWN(size, 4); - if (remaining >= align_size) { + if (rem >= align_size) { size = align_size; break; + } else if (opts->num_accecn_fields == required && + opts->num_sack_blocks > 2 && + required > 0) { + /* Try to fit the option by removing one SACK block */ + opts->num_sack_blocks--; + sack_blocks_reduce++; + rem = rem + TCPOLEN_SACK_PERBLOCK; + + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + size = TCP_ACCECN_MAXSIZE; + continue; } opts->num_accecn_fields--; size -= TCPOLEN_ACCECN_PERFIELD; } + if (sack_blocks_reduce > 0) { + if (opts->num_accecn_fields >= required) + size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK; + else + opts->num_sack_blocks += sack_blocks_reduce; + } if (opts->num_accecn_fields < required) return 0; -- cgit v1.2.3 From 3fbb2a6f3a70c27a6a2be80d131970608c0f84d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:42 +0000 Subject: ipv6: make ipv6_pinfo.saddr_cache a boolean ipv6_pinfo.saddr_cache is either NULL or &np->saddr. We do not need 8 bytes, a boolean is enough. 
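The trick generalizes (the next patch applies it to daddr_cache as well): when a cached pointer can only ever be NULL or the address of one known member, the 8-byte pointer can be replaced by a flag and the pointer rebuilt on demand at the consumer. A minimal sketch of the pattern, with hypothetical names:

#include <stdbool.h>
#include <stddef.h>

struct addr { unsigned char bytes[16]; };

struct pinfo {
	struct addr saddr;
	bool saddr_cached;	/* was: const struct addr *saddr_cache */
};

/* Reconstruct the old pointer-valued cache only where it is consumed */
static const struct addr *cached_saddr(const struct pinfo *np)
{
	return np->saddr_cached ? &np->saddr : NULL;
}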
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-2-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 4 ++-- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f43314517396..55c4d1e4dd7d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -216,10 +216,10 @@ struct inet6_cork { struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; - const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES - const struct in6_addr *saddr_cache; + bool saddr_cache; #endif + const struct in6_addr *daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59f48ca3abdf..223c02d42688 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -230,7 +230,7 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -238,7 +238,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f..c342f8daea7f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd7..1947ccdb00df 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d64c13bab5e..82ff6e1293d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1102,7 +1102,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3371f16b7a3e..e1b0aebf8bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3036,9 +3036,9 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 
- &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..ecc3a87cd8c4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From 5489f333ef993bfceebce9ae98944f04eaafcc30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:43 +0000 Subject: ipv6: make ipv6_pinfo.daddr_cache a boolean ipv6_pinfo.daddr_cache is either NULL or &sk->sk_v6_daddr We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-3-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 3 +-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 55c4d1e4dd7d..8e6d9f8b3dc8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -219,7 +219,7 @@ struct ipv6_pinfo { #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif - const struct in6_addr *daddr_cache; + bool daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 223c02d42688..7c5512baa4b2 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -229,14 +229,14 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, + bool daddr_set, bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr_set; #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c342f8daea7f..1b0314644e0c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 1947ccdb00df..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 82ff6e1293d0..f904739e99b9 100644 --- 
a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1100,7 +1100,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache ? &np->saddr : NULL) || diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e1b0aebf8bf9..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3032,8 +3032,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ecc3a87cd8c4..c7271f6359db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, false); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From 9fba1eb39e2f74d2002c5cbcf1d4435d37a4f752 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:44 +0000 Subject: ipv6: np->rxpmtu race annotation Add READ_ONCE() annotations because np->rxpmtu can be changed while udpv6_recvmsg() and rawv6_recvmsg() read it. Since this is a very rarely used feature, and since udpv6_recvmsg() and rawv6_recvmsg() read np->rxopt anyway, change the test order so that np->rxpmtu does not need to be in a hot cache line.
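The reordering leans on C's short-circuit evaluation: the rxopt bit, which these receive paths load anyway, is tested first, so the cold np->rxpmtu field is only dereferenced when the feature is enabled, and READ_ONCE() marks that load as racing with concurrent writers. A generic sketch of the pattern, with a kernel-style READ_ONCE() approximated via a volatile access and all names hypothetical:

/* GNU C sketch; the kernel's READ_ONCE() is essentially this cast */
#define READ_ONCE(x) (*(const volatile typeof(x) *)&(x))

struct state {
	unsigned int feature_enabled : 1; /* hot word, read anyway */
	void *rarely_used;                /* cold field, written concurrently */
};

static void *maybe_get(struct state *s)
{
	void *p = NULL;

	/* Cheap hot-word test first; racy cold load only when needed */
	if (s->feature_enabled)
		p = READ_ONCE(s->rarely_used);
	return p;
}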
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-4-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4ae07a67b4d4..e369f54844dd 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -445,7 +445,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b70369f3cd32..e87d0ef861f8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -479,7 +479,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: -- cgit v1.2.3 From 9aaec660b5be29f23aaa7d1b0ae426b895dc0ca5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:46 +0000 Subject: udp: refine __udp_enqueue_schedule_skb() test Commit 5a465a0da13e ("udp: Fix multiple wraparounds of sk->sk_rmem_alloc.") allowed to slightly overshoot sk->sk_rmem_alloc, when many cpus are trying to feed packets to a common UDP socket. This patch, combined with the following one reduces false sharing on the victim socket under DDOS. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250916160951.541279-6-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/ipv4/udp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cca41c569f37..edd846fee90f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1739,8 +1739,8 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rcvbuf > INT_MAX >> 1) goto drop; - /* Always allow at least one packet for small buffer. */ - if (rmem > rcvbuf) + /* Accept the packet if queue is empty. */ + if (rmem) goto drop; } -- cgit v1.2.3 From faf7b4aefd5be1d1b460e2161b8f730e03abb9b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:47 +0000 Subject: udp: update sk_rmem_alloc before busylock acquisition Avoid piling too many producers on the busylock by updating sk_rmem_alloc before busylock acquisition. 
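The change amounts to a reserve-then-check pattern: a producer publishes its memory reservation atomically before touching the contended busylock, and backs the reservation out if it overshoots, so packets that would overflow the receive buffer never serialize on the lock at all. A simplified userspace sketch with C11 atomics (names and limit handling hypothetical, kernel details such as skb_condense() omitted):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int rmem_alloc;	/* stands in for sk->sk_rmem_alloc */

/* Try to reserve @size receive-buffer bytes under @limit */
static bool reserve_rmem(int size, int limit)
{
	/* Publish the reservation before any lock is taken */
	int rmem = atomic_fetch_add(&rmem_alloc, size) + size;

	/* Uncharge and bail out instead of contending on the lock */
	if (rmem > limit) {
		atomic_fetch_sub(&rmem_alloc, size);
		return false;
	}
	return true;	/* caller may now take the busylock and enqueue */
}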
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250916160951.541279-7-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/ipv4/udp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index edd846fee90f..658ae8782799 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1753,13 +1753,16 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; + rmem = atomic_add_return(size, &sk->sk_rmem_alloc); + if (rmem > rcvbuf) + goto uncharge_drop; busy = busylock_acquire(sk); + } else { + atomic_add(size, &sk->sk_rmem_alloc); } udp_set_dev_scratch(skb); - atomic_add(size, &sk->sk_rmem_alloc); - spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); if (err) { -- cgit v1.2.3 From 9db27c80622bd612549ea213390500f7377ee3e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:49 +0000 Subject: udp: add udp_drops_inc() helper Generic sk_drops_inc() reads sk->sk_drop_counters. We know the precise location for UDP sockets. Move sk_drop_counters out of sock_read_rxtx so that sock_write_rxtx starts at a cache line boundary. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-9-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/sock.h | 2 +- include/net/udp.h | 5 +++++ net/core/sock.c | 1 - net/ipv4/udp.c | 12 ++++++------ net/ipv6/udp.c | 6 +++--- 5 files changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 867dc44140d4..82bcdb7d7e67 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -451,7 +451,6 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -568,6 +567,7 @@ struct sock { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif + struct numa_drop_counters *sk_drop_counters; struct rcu_head sk_rcu; netns_tracker ns_tracker; struct xarray sk_user_frags; diff --git a/include/net/udp.h b/include/net/udp.h index 93b159f30e88..a08822e294b0 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -295,6 +295,11 @@ static inline void udp_lib_init_sock(struct sock *sk) set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } +static inline void udp_drops_inc(struct sock *sk) +{ + numa_drop_add(&udp_sk(sk)->drop_counters, 1); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/core/sock.c b/net/core/sock.c index 1f8ef4d8bcd9..21742da19e45 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4444,7 +4444,6 @@ static int __init sock_struct_check(void) #ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); #endif - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_drop_counters); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 658ae8782799..25143f932447 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1790,7 +1790,7 @@ 
uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: - sk_drops_inc(sk); + udp_drops_inc(sk); busylock_release(busy); return err; } @@ -1855,7 +1855,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - sk_drops_inc(sk); + udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2011,7 +2011,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2081,7 +2081,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2452,7 +2452,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2537,7 +2537,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e87d0ef861f8..9f4d340d1e3a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -524,7 +524,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - sk_drops_inc(sk); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +908,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - sk_drops_inc(sk); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1013,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - sk_drops_inc(sk); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, -- cgit v1.2.3 From 3cd04c8f4afed71a48edef0db5255afc249c2feb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:50 +0000 Subject: udp: make busylock per socket While having all spinlocks packed into an array was a space saver, this also caused NUMA imbalance and hash collisions. UDPv6 socket size becomes 1600 after this patch. 
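The essence of the change, as a before/after sketch drawn from the diff below:

/* Before: one global lock array, indexed by a hash of the socket
 * pointer; unrelated sockets could collide on a lock, and the cache
 * line could live on a remote NUMA node.
 */
busy = udp_busylocks + hash_ptr(sk, udp_busylocks_log);

/* After: the lock is embedded in udp_sock, on its own cache line,
 * so it is always collision-free and NUMA-local.
 */
busy = &udp_sk(sk)->busylock;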
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-10-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + net/ipv4/udp.c | 20 ++------------------ 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 6ed008ab1665..e554890c4415 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -109,6 +109,7 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; + spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index a08822e294b0..eecd64097f91 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -289,6 +289,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; + spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25143f932447..7d1444821ee5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1689,17 +1689,11 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. - * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) +static spinlock_t *busylock_acquire(struct sock *sk) { - spinlock_t *busy; + spinlock_t *busy = &udp_sk(sk)->busylock; - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); spin_lock(busy); return busy; } @@ -3997,7 +3991,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4006,15 +3999,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); -- cgit v1.2.3 From 6471658dc66c670580a7616e75f51b52917e7883 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:51 +0000 Subject: udp: use skb_attempt_defer_free() Move skb freeing from the udp recvmsg() path to the CPU which allocated/received it, as TCP did in linux-5.17. This increases max throughput by 20% to 30%, depending on the number of BH producers.
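Conceptually, skb_attempt_defer_free() hands the skb back to the CPU recorded in skb->alloc_cpu instead of freeing it on the consumer's CPU. A simplified model (a sketch, not the exact implementation; defer_to_alloc_cpu() is a made-up name for the per-CPU defer-list queueing, which the real code also bounds and kicks remotely):

/* Sketch of skb_attempt_defer_free() behavior: */
unsigned int cpu = skb->alloc_cpu;

if (cpu == raw_smp_processor_id() || !cpu_online(cpu))
	__kfree_skb(skb);		/* free locally */
else
	defer_to_alloc_cpu(skb, cpu);	/* free where it was allocated,
					 * while caches are still hot */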
Signed-off-by: Eric Dumazet Acked-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-11-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- net/ipv4/udp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7d1444821ee5..0c40426628eb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1825,6 +1825,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); + if (!skb_shared(skb)) { + if (unlikely(udp_skb_has_head_state(skb))) + skb_release_head_state(skb); + skb_attempt_defer_free(skb); + return; + } + if (!skb_unref(skb)) return; -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'net') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. + +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. 
+ type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. 
*/ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. 
+ */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. 
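Stepping back from the hunks for a moment: a hypothetical NIC driver would hook into the new API roughly as follows. This is a sketch, not part of the patch; struct my_nic, my_psp_set_config() and the capability value are invented for illustration.

/* Hypothetical driver glue for psp_dev_create()/psp_dev_unregister(). */
static struct psp_dev_ops my_psp_ops = {
	.set_config	= my_psp_set_config,	/* the only required op here */
};

static struct psp_dev_caps my_psp_caps = {
	.versions	= 1 << PSP_VERSION_HDR0_AES_GCM_128,
};

static int my_nic_register_psp(struct my_nic *priv)
{
	priv->psd = psp_dev_create(priv->netdev, &my_psp_ops,
				   &my_psp_caps, priv);
	return PTR_ERR_OR_ZERO(priv->psd);
	/* teardown path calls psp_dev_unregister(priv->psd) */
}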
diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + 
+void psp_dev_destroy(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_destroy(psd); +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..e09499b7b14a --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_destroy(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. 
+ */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + 
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From ed8a507b748336902525aa79e3573552534e8b3e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:30 -0700 Subject: net: modify core data structures for PSP datapath support Add pointers to psp data structures to core networking structs, and an SKB extension to carry the PSP information from the drivers to the socket layer. 
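To illustrate how the new SKB extension is meant to flow from driver to socket layer (a sketch; the locals spi, gen and version are assumed, and the Rx policy hooks that consume the extension arrive in later patches of the series):

/* Rx completion in a PSP-capable driver: stash decrypt metadata. */
struct psp_skb_ext *pse = skb_ext_add(skb, SKB_EXT_PSP);

if (pse) {
	pse->spi	= spi;		/* SPI from the parsed PSP header */
	pse->dev_id	= psd->id;	/* device that decrypted the skb */
	pse->generation	= gen;		/* device key generation */
	pse->version	= version;	/* PSP version of the packet */
}

/* The socket layer later retrieves it for the Rx policy checks: */
pse = skb_ext_find(skb, SKB_EXT_PSP);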
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-4-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ include/net/inet_timewait_sock.h | 3 +++ include/net/psp/functions.h | 6 ++++++ include/net/psp/types.h | 7 +++++++ include/net/sock.h | 4 ++++ net/core/skbuff.c | 4 ++++ net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp_minisocks.c | 2 ++ 8 files changed, 31 insertions(+) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62e7addccdf6..78ecfa7d00d0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4901,6 +4901,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + SKB_EXT_PSP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 67a313575780..c1295246216c 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -81,6 +81,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif }; #define tw_tclass tw_tos diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 074f9df9afc3..d0043bd14299 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,10 +5,16 @@ #include +struct inet_timewait_sock; + /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +/* Kernel-facing API */ +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } + #endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h index d242b1ecee7d..4922fc8d42fd 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -84,6 +84,13 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +struct psp_skb_ext { + __be32 spi; + u16 dev_id; + u8 generation; + u8 version; +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334..d1d3d36e39ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -249,6 +249,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -450,6 +451,9 @@ struct sock { #endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; #endif struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 23b776cd9879..d331e607edfb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5062,6 +5063,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = 
SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76e38092cd8a..e298dacb4a06 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); + psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 327095ef95ef..ddb67015ba28 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -24,6 +24,7 @@ #include #include #include +#include static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -392,6 +393,7 @@ void tcp_twsk_destructor(struct sock *sk) } #endif tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) -- cgit v1.2.3 From 659a2899a57da59f433182eba571881884d6323e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:31 -0700 Subject: tcp: add datapath logic for PSP with inline key exchange Add validation points and state propagation to support PSP key exchange inline, on TCP connections. The expectation is that the application will use some well-established mechanism like a TLS handshake to establish a secure channel over the connection and, if both endpoints are PSP-capable, exchange and install PSP keys. Because the connection can exist in both PSP-unsecured and PSP-secured states, we need to make sure that there are no race conditions or retransmission leaks. On Tx - mark packets with the skb->decrypted bit when a PSP key is present at the enqueue time. Drivers should only encrypt packets with this bit set. This prevents retransmissions getting encrypted when the original transmission was not. Similarly to TLS, we'll use sk->sk_validate_xmit_skb to make sure PSP skbs can't "escape" via a PSP-unaware device without being encrypted. On Rx - validation is done under the socket lock. This moves the validation point later than xfrm, for example. Please see the documentation patch for more details on the flow of securing a connection, but for the purpose of this patch what's important is that we want to enforce the invariant that once the connection is secured, any skb in the receive queue has been encrypted with PSP. Add GRO and coalescing checks to prevent PSP-authenticated data from being combined with cleartext data, or data with non-matching PSP state. On Rx, check skbs with psp_skb_coalesce_diff() at points before psp_sk_rx_policy_check(). After skbs are policy-checked and on the socket receive queue, skb_cmp_decrypted() is sufficient for checking for coalescable PSP state. On Tx, tcp_write_collapse_fence() should be called when transitioning a socket into PSP Tx state to prevent data sent as cleartext from being coalesced with PSP-encapsulated data. This change only adds the validation points, for ease of review.
Subsequent change will add the ability to install keys, and flesh the enforcement logic out Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-5-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 ++++ include/net/psp/functions.h | 77 +++++++++++++++++++++++++++++++++++++++++++ net/core/gro.c | 2 ++ net/ipv4/inet_timewait_sock.c | 2 ++ net/ipv4/ip_output.c | 5 ++- net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_ipv4.c | 14 +++++++- net/ipv4/tcp_minisocks.c | 18 ++++++++++ net/ipv4/tcp_output.c | 17 ++++++---- net/ipv6/tcp_ipv6.c | 11 +++++++ net/psp/Kconfig | 1 + 11 files changed, 147 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d8ff24a33459..58d91ccc56e0 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -127,6 +127,8 @@ FN(CANXL_RX_INVALID_FRAME) \ FN(PFMEMALLOC) \ FN(DUALPI2_STEP_DROP) \ + FN(PSP_INPUT) \ + FN(PSP_OUTPUT) \ FNe(MAX) /** @@ -610,6 +612,10 @@ enum skb_drop_reason { * threshold of DualPI2 qdisc. */ SKB_DROP_REASON_DUALPI2_STEP_DROP, + /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */ + SKB_DROP_REASON_PSP_INPUT, + /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ + SKB_DROP_REASON_PSP_OUTPUT, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index d0043bd14299..1ccc5fc238b8 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -3,6 +3,8 @@ #ifndef __NET_PSP_HELPERS_H #define __NET_PSP_HELPERS_H +#include +#include #include struct inet_timewait_sock; @@ -14,7 +16,82 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +#if IS_ENABLED(CONFIG_INET_PSP) static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(struct sk_buff *skb) { } + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) +{ +} + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} +#else +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(struct sk_buff *skb) { } + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason 
+psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} +#endif + +static inline unsigned long +psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) +{ + return __psp_skb_coalesce_diff(one, two, 0); +} #endif /* __NET_PSP_HELPERS_H */ diff --git a/net/core/gro.c b/net/core/gro.c index b350e5b69549..5ba4504cfd28 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include @@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b5426b8ee92..1f83f333b8ac 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -16,6 +16,7 @@ #include #include #include +#include /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -219,6 +220,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); + psp_twsk_init(tw, sk); } return tw; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2b96651d719b..5ca97ede979c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,6 +84,7 @@ #include #include #include +#include static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -1665,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - if (orig_sk) + if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); + psp_reply_set_decrypted(nskb); + } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5b5c655ded1d..d6d0d970e014 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -277,6 +277,7 @@ #include #include #include +#include #include #include @@ -705,6 +706,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a63be1f6461..f27f6f865a48 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -1907,6 +1908,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) enum skb_drop_reason reason; struct sock *rsk; + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1968,6 +1973,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -2069,7 +2075,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || 
!tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || - memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || + /* prior to PSP Rx policy check, retain exact PSP metadata */ + psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); @@ -2437,6 +2445,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ddb67015ba28..2ec8c6f1cdcc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -105,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; + enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; + /* Instead of dropping immediately, wait to see what value is + * returned. We will accept a non psp-encapsulated syn in the + * case where TCP_TW_SYN is returned. + */ + psp_drop = psp_twsk_rx_policy_check(tw, skb); + tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { @@ -125,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ + if (psp_drop) + goto out_put; + /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, @@ -195,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ + if (psp_drop) + goto out_put; + if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this @@ -248,6 +261,9 @@ kill: return TCP_TW_SYN; } + if (psp_drop) + goto out_put; + if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); @@ -266,6 +282,8 @@ kill: return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } + +out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 388c45859469..223d7feeb19d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -358,13 +359,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk, + u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); + psp_enqueue_set_decrypted(sk, skb); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -1656,6 +1659,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. 
*/ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -3778,7 +3782,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - tcp_init_nondata_skb(skb, tp->write_seq, + tcp_init_nondata_skb(skb, sk, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } @@ -3806,7 +3810,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ @@ -4303,7 +4307,7 @@ int tcp_connect(struct sock *sk) /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ - tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); + tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); @@ -4428,7 +4432,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); + tcp_init_nondata_skb(buff, sk, + tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4474,7 +4479,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..4da8eb9183d7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -973,6 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); + psp_reply_set_decrypted(buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; @@ -1605,6 +1607,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. 
@@ -1684,6 +1690,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1988,6 +1995,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 55f9dd87446b..5e3908a40945 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -5,6 +5,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET + select SKB_DECRYPTED help Enable kernel support for the PSP protocol. For more information see: -- cgit v1.2.3 From 117f02a49b7719b210d154a0d0e728001bf4af06 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:32 -0700 Subject: psp: add op for rotation of device key Rotating the device key is a key part of the PSP protocol design. Some external daemon needs to do it once a day, or so. Add a netlink op to perform this operation. Add a notification group for informing users that key has been rotated and they should rekey (next rotation will cut them off). Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-6-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 21 +++++++++++++++++++ include/net/psp/types.h | 5 +++++ include/uapi/linux/psp.h | 3 +++ net/psp/psp-nl-gen.c | 15 ++++++++++++++ net/psp/psp-nl-gen.h | 2 ++ net/psp/psp_main.c | 3 ++- net/psp/psp_nl.c | 40 ++++++++++++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 706f4baf8764..054cc02b65ad 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -88,9 +88,30 @@ operations: notify: dev-get mcgrp: mgmt + - + name: key-rotate + doc: Rotate the device key. + attribute-set: dev + do: + request: + attributes: + - id + reply: + attributes: + - id + pre: psp-device-get-locked + post: psp-device-unlock + - + name: key-rotate-ntf + doc: Notification about device key getting rotated. + notify: key-rotate + mcgrp: use + mcast-groups: list: - name: mgmt + - + name: use ... 
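Before the remaining hunks, a sketch of what a driver's key_rotate() callback is expected to do (illustrative only; my_hw_replace_stale_key() is a hypothetical device interface, and the exact mechanics are hardware-specific):

static int my_psp_key_rotate(struct psp_dev *psd,
			     struct netlink_ext_ack *extack)
{
	/* The device keeps two master keys, selected by the SPI
	 * key-phase bit (PSP_SPI_KEY_PHASE). Rotation replaces the
	 * older of the two, so keys derived from it stop working;
	 * that is why the "use" notification tells listeners to
	 * rekey before the next rotation cuts them off.
	 */
	return my_hw_replace_stale_key(psd->drv_priv, extack);
}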
diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 4922fc8d42fd..66327fa80c92 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -102,6 +102,11 @@ struct psp_dev_ops { */ int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 4a404f085190..cbfbf3f0f364 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -32,11 +32,14 @@ enum { PSP_CMD_DEV_DEL_NTF, PSP_CMD_DEV_SET, PSP_CMD_DEV_CHANGE_NTF, + PSP_CMD_KEY_ROTATE, + PSP_CMD_KEY_ROTATE_NTF, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) }; #define PSP_MCGRP_MGMT "mgmt" +#define PSP_MCGRP_USE "use" #endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 859712e7c2c1..7f49577ac72f 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -21,6 +21,11 @@ static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), }; +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -46,10 +51,20 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, }; struct genl_family psp_nl_family __ro_after_init = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index a099686cab5d..00a2d4ec59e4 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -20,9 +20,11 @@ psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, + PSP_NLGRP_USE, }; extern struct genl_family psp_nl_family; diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e09499b7b14a..f60155493afc 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -54,7 +54,8 @@ psp_dev_create(struct net_device *netdev, int err; if (WARN_ON(!psd_caps->versions || - !psd_ops->set_config)) + !psd_ops->set_config || + !psd_ops->key_rotate)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index fda5ce800f82..75f2702c1029 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -221,3 +221,43 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, 
PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} -- cgit v1.2.3 From 8c511c1df380780b8a81050767dbfe7ca518d3a2 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 16 Sep 2025 17:09:33 -0700 Subject: net: move sk_validate_xmit_skb() to net/core/dev.c Move definition of sk_validate_xmit_skb() from net/core/sock.c to net/core/dev.c. This change is in preparation of the next patch, where sk_validate_xmit_skb() will need to cast sk to a tcp_timewait_sock *, and access member fields. Including linux/tcp.h from linux/sock.h creates a circular dependency, and dev.c is the only current call site of this function. Reviewed-by: Willem de Bruijn Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-7-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 22 ---------------------- net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index d1d3d36e39ae..bf92029a88d6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2960,28 +2960,6 @@ sk_requests_wifi_status(struct sock *sk) return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } -/* Checks if this SKB belongs to an HW offloaded socket - * and whether any SW fallbacks are required based on dev. - * Check decrypted mark in case skb_orphan() cleared socket. - */ -static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, - struct net_device *dev) -{ -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sock *sk = skb->sk; - - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); - } else if (unlikely(skb_is_decrypted(skb))) { - pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); - kfree_skb(skb); - skb = NULL; - } -#endif - - return skb; -} - /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ diff --git a/net/core/dev.c b/net/core/dev.c index 2522d9d8f0e4..384e59d7e715 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3907,6 +3907,28 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. 
+ */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sock *sk = skb->sk; + + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { + skb = sk->sk_validate_xmit_skb(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; + } +#endif + + return skb; +} + static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { -- cgit v1.2.3 From 0917bb139eed467a6376db903ad7a67981ec1420 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 16 Sep 2025 17:09:34 -0700 Subject: net: tcp: allow tcp_timewait_sock to validate skbs before handing to device Provide a callback to validate skb's originating from tcp timewait socks before passing to the device layer. Full socks have a sk_validate_xmit_skb member for checking that a device is capable of performing offloads required for transmitting an skb. With psp, tcp timewait socks will inherit the crypto state from their corresponding full socks. Any ACKs or RSTs that originate from a tcp timewait sock carrying psp state should be psp encapsulated. Reviewed-by: Willem de Bruijn Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-8-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/inet_timewait_sock.h | 5 +++++ net/core/dev.c | 14 ++++++++++++-- net/ipv4/inet_timewait_sock.c | 3 +++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c1295246216c..3a31c74c9e15 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -84,6 +84,11 @@ struct inet_timewait_sock { #if IS_ENABLED(CONFIG_INET_PSP) struct psp_assoc __rcu *psp_assoc; #endif +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*tw_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif }; #define tw_tclass tw_tos diff --git a/net/core/dev.c b/net/core/dev.c index 384e59d7e715..5e22d062bac5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3915,10 +3915,20 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 1f83f333b8ac..2ca2912f61f4 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -212,6 +212,9 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + tw->tw_validate_xmit_skb = NULL; +#endif /* * Because we use 
RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this -- cgit v1.2.3 From 6b46ca260e2290e3453d1355ab5b6d283d73d780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:35 -0700 Subject: net: psp: add socket security association code Add the ability to install PSP Rx and Tx crypto keys on TCP connections. Netlink ops are provided for both operations. Rx side combines allocating a new Rx key and installing it on the socket. Theoretically these are separate actions, but in practice they will always be used one after the other. We can add distinct "alloc" and "install" ops later. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-9-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 70 +++++++++ include/net/psp/functions.h | 114 +++++++++++++-- include/net/psp/types.h | 57 ++++++++ include/uapi/linux/psp.h | 21 +++ net/psp/Kconfig | 1 + net/psp/Makefile | 2 +- net/psp/psp-nl-gen.c | 39 +++++ net/psp/psp-nl-gen.h | 7 + net/psp/psp.h | 22 +++ net/psp/psp_main.c | 26 +++- net/psp/psp_nl.c | 232 +++++++++++++++++++++++++++++ net/psp/psp_sock.c | 274 +++++++++++++++++++++++++++++++++++ 12 files changed, 854 insertions(+), 11 deletions(-) create mode 100644 net/psp/psp_sock.c (limited to 'net') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 054cc02b65ad..944429e5c9a8 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -38,6 +38,44 @@ attribute-sets: type: u32 enum: version enum-as-flags: true + - + name: assoc + attributes: + - + name: dev-id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: version + doc: | + PSP versions (AEAD and protocol version) used by this association, + dictates the size of the key. + type: u32 + enum: version + - + name: rx-key + type: nest + nested-attributes: keys + - + name: tx-key + type: nest + nested-attributes: keys + - + name: sock-fd + doc: Sockets which should be bound to the association immediately. + type: u32 + - + name: keys + attributes: + - + name: key + type: binary + - + name: spi + doc: Security Parameters Index (SPI) of the association. + type: u32 operations: list: @@ -107,6 +145,38 @@ operations: notify: key-rotate mcgrp: use + - + name: rx-assoc + doc: Allocate a new Rx key + SPI pair, associate it with a socket. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - sock-fd + reply: + attributes: + - dev-id + - rx-key + pre: psp-assoc-device-get-locked + post: psp-device-unlock + - + name: tx-assoc + doc: Add a PSP Tx association. 
+ attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - tx-key + - sock-fd + reply: + attributes: [] + pre: psp-assoc-device-get-locked + post: psp-device-unlock + mcast-groups: list: - diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 1ccc5fc238b8..0d7141230f47 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -4,7 +4,9 @@ #define __NET_PSP_HELPERS_H #include +#include #include +#include #include struct inet_timewait_sock; @@ -16,41 +18,130 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + #if IS_ENABLED(CONFIG_INET_PSP) -static inline void psp_sk_assoc_free(struct sock *sk) { } -static inline void -psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } -static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } -static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); return diffs; } +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? 
SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +{ + struct inet_timewait_sock *tw; + struct psp_assoc *pas; + int state; + + state = 1 << READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + return NULL; + + tw = inet_twsk(sk); + pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { - return NULL; + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); } #else static inline void psp_sk_assoc_free(struct sock *sk) { } @@ -60,6 +151,11 @@ static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(struct sk_buff *skb) { } +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 66327fa80c92..b0e32e7165a3 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -51,6 +51,7 @@ struct psp_dev_config { * @refcnt: reference count for the instance * @id: instance id * @config: current device configuration + * @active_assocs: list of registered associations * * @rcu: RCU head for freeing the structure */ @@ -68,6 +69,8 @@ struct psp_dev { struct psp_dev_config config; + struct list_head active_assocs; + struct rcu_head rcu; }; @@ -80,6 +83,12 @@ struct psp_dev_caps { * Set this field to 0 to indicate PSP is not supported at all. */ u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_spc + */ + u32 assoc_drv_spc; }; #define PSP_MAX_KEY 32 @@ -91,6 +100,32 @@ struct psp_skb_ext { u8 version; }; +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ @@ -107,6 +142,28 @@ struct psp_dev_ops { * @key_rotate: rotate the device key */ int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. 
Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index cbfbf3f0f364..607c42c39ba5 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -26,6 +26,25 @@ enum { PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) }; +enum { + PSP_A_ASSOC_DEV_ID = 1, + PSP_A_ASSOC_VERSION, + PSP_A_ASSOC_RX_KEY, + PSP_A_ASSOC_TX_KEY, + PSP_A_ASSOC_SOCK_FD, + + __PSP_A_ASSOC_MAX, + PSP_A_ASSOC_MAX = (__PSP_A_ASSOC_MAX - 1) +}; + +enum { + PSP_A_KEYS_KEY = 1, + PSP_A_KEYS_SPI, + + __PSP_A_KEYS_MAX, + PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1) +}; + enum { PSP_CMD_DEV_GET = 1, PSP_CMD_DEV_ADD_NTF, @@ -34,6 +53,8 @@ enum { PSP_CMD_DEV_CHANGE_NTF, PSP_CMD_KEY_ROTATE, PSP_CMD_KEY_ROTATE_NTF, + PSP_CMD_RX_ASSOC, + PSP_CMD_TX_ASSOC, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 5e3908a40945..a7d24691a7e1 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP protocol. For more information see: diff --git a/net/psp/Makefile b/net/psp/Makefile index 41b51d06e560..eb5ff3c5bfb2 100644 --- a/net/psp/Makefile +++ b/net/psp/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_INET_PSP) += psp.o -psp-y := psp_main.o psp_nl.o psp-nl-gen.o +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 7f49577ac72f..9fdd6f831803 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -10,6 +10,12 @@ #include +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + /* PSP_CMD_DEV_GET - do */ static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), @@ -26,6 +32,21 @@ static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -60,6 +81,24 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_ID, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = 
psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 00a2d4ec59e4..25268ed11fb5 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -11,8 +11,13 @@ #include +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + int psp_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -21,6 +26,8 @@ int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, diff --git a/net/psp/psp.h b/net/psp/psp.h index 94d0cc31a61f..defd3e3fd5e7 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -4,6 +4,7 @@ #define __PSP_PSP_H #include +#include #include #include #include @@ -17,15 +18,36 @@ int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); + static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_destroy(psd); } +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + #endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index f60155493afc..a1ae3c8920c3 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -55,7 +55,10 @@ psp_dev_create(struct net_device *netdev, if (WARN_ON(!psd_caps->versions || !psd_ops->set_config || - !psd_ops->key_rotate)) + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); @@ -68,6 +71,7 @@ psp_dev_create(struct net_device *netdev, psd->drv_priv = priv_ptr; mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -107,6 +111,8 @@ void psp_dev_destroy(struct psp_dev *psd) */ void psp_dev_unregister(struct psp_dev *psd) { + struct psp_assoc *pas, *next; + mutex_lock(&psp_devs_lock); mutex_lock(&psd->lock); @@ -119,6 +125,9 @@ void psp_dev_unregister(struct psp_dev *psd) 
xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); + list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); psd->ops = NULL; @@ -130,6 +139,21 @@ void psp_dev_unregister(struct psp_dev *psd) } EXPORT_SYMBOL(psp_dev_unregister); +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 75f2702c1029..1b1d08fce637 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -79,9 +79,12 @@ void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); } static int @@ -261,3 +264,232 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = 
cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..8ebccee94593 --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct dst_entry *dst; + struct psp_dev *psd; + + dst = sk_dst_get(sk); + if (!dst) + return NULL; + + rcu_read_lock(); + psd = rcu_dereference(dst->dev->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + rcu_read_unlock(); + + dst_release(dst); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct 
net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + kfree_skb_reason(skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } 
+ if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(skb->sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); -- cgit v1.2.3 From e97269257fe437910cddc7c642a636ca3cf9fb1d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:36 -0700 Subject: net: psp: update the TCP MSS to reflect PSP packet overhead PSP eats 40B of header space. Adjust MSS appropriately. We can either modify tcp_mtu_to_mss() / tcp_mss_to_mtu() or reuse icsk_ext_hdr_len. The former option is more TCP specific and has runtime overhead. The latter is a bit of a hack as PSP is not an ext_hdr. If one squints hard enough, UDP encap is just a more practical version of IPv6 exthdr, so go with the latter. Happy to change. 
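The 40B figure is the sum of the UDP header (8B), the fixed PSP header (16B, no optional fields), and the AES-GCM/GMAC trailer (16B), matching the PSP_HDR_SIZE/PSP_TRL_SIZE constants added below. A minimal standalone sketch of the resulting MSS arithmetic (the 1500B MTU is just an example value):

    #include <stdio.h>

    #define UDP_HLEN     8   /* sizeof(struct udphdr) */
    #define PSP_HDR_SIZE 16  /* fixed PSP header, no optional fields */
    #define PSP_TRL_SIZE 16  /* AES-GCM/GMAC integrity tag */

    int main(void)
    {
        int overhead = UDP_HLEN + PSP_HDR_SIZE + PSP_TRL_SIZE; /* 40 */
        int mss = 1500 - 20 - 20; /* IPv4 + TCP headers off the MTU */

        printf("PSP overhead: %d bytes\n", overhead);
        printf("MSS without PSP: %d, with PSP: %d\n", mss, mss - overhead);
        return 0;
    }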
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-10-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 14 ++++++++++++++ include/net/psp/types.h | 3 +++ net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/ipv6_sockglue.c | 6 +++++- net/ipv6/tcp_ipv6.c | 6 +++--- net/psp/psp_sock.c | 5 +++++ 6 files changed, 32 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 0d7141230f47..183a3c9216b7 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -143,6 +144,14 @@ static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) return psp_sk_get_assoc_rcu(skb->sk); } + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; + bool has_psp = rcu_access_pointer(sk->psp_assoc); + + return has_psp ? psp_encap : 0; +} #else static inline void psp_sk_assoc_free(struct sock *sk) { } static inline void @@ -182,6 +191,11 @@ static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { return NULL; } + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + return 0; +} #endif static inline unsigned long diff --git a/include/net/psp/types.h b/include/net/psp/types.h index b0e32e7165a3..f93ad0e6c04f 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -93,6 +93,9 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +#define PSP_HDR_SIZE 16 /* We don't support optional fields, yet */ +#define PSP_TRL_SIZE 16 /* AES-GCM/GMAC trailer size */ + struct psp_skb_ext { __be32 spi; u16 dev_id; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f27f6f865a48..b1fcf3e4e1ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -294,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); - inet_csk(sk)->icsk_ext_hdr_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e66ec623972e..a61e742794f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -107,7 +108,10 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + + icsk->icsk_ext_hdr_len = + psp_sk_overhead(sk) + + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4da8eb9183d7..43d1109e2180 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -302,10 +302,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); - icsk->icsk_ext_hdr_len = 0; + icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + - opt->opt_nflen; + icsk->icsk_ext_hdr_len += opt->opt_flen 
+ + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 8ebccee94593..10e1fda30aa0 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -180,6 +180,7 @@ int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack) { + struct inet_connection_sock *icsk; struct psp_assoc *pas, *dummy; int err; @@ -236,6 +237,10 @@ int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, tcp_write_collapse_fence(sk); pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len += psp_sk_overhead(sk); + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + exit_free_dummy: kfree(dummy); exit_unlock: -- cgit v1.2.3 From e78851058b35deb9f2d60ecf698fbf7ae7790d09 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:37 -0700 Subject: psp: track generations of device key There is a (somewhat theoretical in absence of multi-host support) possibility that another entity will rotate the key and we won't know. This may lead to accepting packets with matching SPI but which used different crypto keys than we expected. The PSP Architecture specification mentions that an implementation should track device key generation when device keys are managed by the NIC. Some PSP implementations may opt to include this key generation state in decryption metadata each time a device key is used to decrypt a packet. If that is the case, that key generation counter can also be used when policy checking a decrypted skb against a psp_assoc. This is an optional feature that is not explicitly part of the PSP spec, but can provide additional security in the case where an attacker may have the ability to force key rotations faster than rekeying can occur. Since we're tracking "key generations" more explicitly now, maintain different lists for associations from different generations. This way we can catch stale associations (the user space should listen to rotation notifications and change the keys). Drivers can "opt out" of generation tracking by setting the generation value to 0. 
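The rotation lifecycle above can be modelled in a few lines of user-space C; PSP_GEN_VALID_MASK is the constant added by this patch, everything else is illustrative. Generations occupy the low 7 bits, so marking a stale association with a bit outside that mask guarantees it can never again equal a generation reported for a decrypted packet:

    #include <stdbool.h>
    #include <stdio.h>

    #define PSP_GEN_VALID_MASK 0x7f

    static bool rx_gen_ok(unsigned char assoc_gen, unsigned char pkt_gen)
    {
        return assoc_gen == pkt_gen; /* mirrors psp_pse_matches_pas() */
    }

    int main(void)
    {
        unsigned char assoc_gen = 3; /* recorded when the assoc was created */

        /* After one rotation the assoc is merely "prev": packets
         * decrypted with its key still report generation 3 and match.
         */
        printf("prev assoc:  %d\n", rx_gen_ok(assoc_gen, 3));

        /* The second rotation marks it stale: a bit outside the valid
         * mask is set, so no packet generation can ever match again.
         */
        assoc_gen |= ~PSP_GEN_VALID_MASK;
        printf("stale assoc: %d\n", rx_gen_ok(assoc_gen, 3));
        return 0;
    }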
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-11-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/types.h | 10 ++++++++++ net/psp/psp.h | 1 + net/psp/psp_main.c | 6 +++++- net/psp/psp_nl.c | 10 ++++++++++ net/psp/psp_sock.c | 16 ++++++++++++++++ 5 files changed, 42 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index f93ad0e6c04f..ec218747ced0 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -50,8 +50,12 @@ struct psp_dev_config { * @lock: instance lock, protects all fields * @refcnt: reference count for the instance * @id: instance id + * @generation: current generation of the device key * @config: current device configuration * @active_assocs: list of registered associations + * @prev_assocs: associations which use old (but still usable) + * device key + * @stale_assocs: associations which use a rotated out key * * @rcu: RCU head for freeing the structure */ @@ -67,13 +71,19 @@ struct psp_dev { u32 id; + u8 generation; + struct psp_dev_config config; struct list_head active_assocs; + struct list_head prev_assocs; + struct list_head stale_assocs; struct rcu_head rcu; }; +#define PSP_GEN_VALID_MASK 0x7f + /** * struct psp_dev_caps - PSP device capabilities */ diff --git a/net/psp/psp.h b/net/psp/psp.h index defd3e3fd5e7..0f34e1a23fdd 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -27,6 +27,7 @@ int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, u32 version, struct psp_key_parsed *key, struct netlink_ext_ack *extack); +void psp_assocs_key_rotated(struct psp_dev *psd); static inline void psp_dev_get(struct psp_dev *psd) { diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index a1ae3c8920c3..98ad8c85b58e 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -72,6 +72,8 @@ psp_dev_create(struct net_device *netdev, mutex_init(&psd->lock); INIT_LIST_HEAD(&psd->active_assocs); + INIT_LIST_HEAD(&psd->prev_assocs); + INIT_LIST_HEAD(&psd->stale_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -125,7 +127,9 @@ void psp_dev_unregister(struct psp_dev *psd) xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); - list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list) psp_dev_tx_key_del(psd, pas); rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 1b1d08fce637..8aaca62744c3 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -230,6 +230,7 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) struct psp_dev *psd = info->user_ptr[0]; struct genl_info ntf_info; struct sk_buff *ntf, *rsp; + u8 prev_gen; int err; rsp = psp_nl_reply_new(info); @@ -249,10 +250,19 @@ int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) goto err_free_ntf; } + /* suggest the next gen number, driver can override */ + prev_gen = psd->generation; + psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK; + err = psd->ops->key_rotate(psd, info->extack); if (err) goto err_free_ntf; + WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) || + psd->generation & 
~PSP_GEN_VALID_MASK); + + psp_assocs_key_rotated(psd); + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, 0, PSP_NLGRP_USE, GFP_KERNEL); diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 10e1fda30aa0..afa966c6b69d 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -60,6 +60,7 @@ struct psp_assoc *psp_assoc_create(struct psp_dev *psd) pas->psd = psd; pas->dev_id = psd->id; + pas->generation = psd->generation; psp_dev_get(psd); refcount_set(&pas->refcnt, 1); @@ -248,6 +249,21 @@ exit_unlock: return err; } +void psp_assocs_key_rotated(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + /* Mark the stale associations as invalid, they will no longer + * be able to Rx any traffic. + */ + list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) + pas->generation |= ~PSP_GEN_VALID_MASK; + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + + /* TODO: we should inform the sockets that got shut down */ +} + void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { struct psp_assoc *pas = psp_sk_assoc(sk); -- cgit v1.2.3 From fc724515741a1b86ca0457825fdb784ab038e92c Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 16 Sep 2025 17:09:40 -0700 Subject: psp: provide encapsulation helper for drivers Create a new function psp_encapsulate(), which takes a TCP packet and PSP encapsulates it according to the "Transport Mode Packet Format" section of the PSP Architecture Specification. psp_encapsulate() does not push a PSP trailer onto the skb. Both IPv6 and IPv4 are supported. Virtualization cookie is not included. Reviewed-by: Willem de Bruijn Signed-off-by: Raed Salem Signed-off-by: Rahul Rameshbabu Signed-off-by: Cosmin Ratiu Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-14-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 2 ++ include/net/psp/types.h | 2 ++ net/psp/psp_main.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) (limited to 'net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 183a3c9216b7..0a539e1b39f4 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -17,6 +17,8 @@ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); diff --git a/include/net/psp/types.h b/include/net/psp/types.h index ec218747ced0..d9688e66cf09 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -20,6 +20,8 @@ struct psphdr { __be64 vc[]; /* optional */ }; +#define PSP_ENCAP_HLEN (sizeof(struct udphdr) + sizeof(struct psphdr)) + #define PSP_SPI_KEY_ID GENMASK(30, 0) #define PSP_SPI_KEY_PHASE BIT(31) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 98ad8c85b58e..e026880fa1a2 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include #include #include +#include #include "psp.h" #include "psp-nl-gen.h" @@ -158,6 +160,69 @@ unsigned int psp_key_size(u32 version) } EXPORT_SYMBOL(psp_key_size); +static void 
psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, unsigned int udp_len, __be16 sport) +{ + struct udphdr *uh = udp_hdr(skb); + struct psphdr *psph = (struct psphdr *)(uh + 1); + + uh->dest = htons(PSP_DEFAULT_UDP_PORT); + uh->source = udp_flow_src_port(net, skb, 0, 0, false); + uh->check = 0; + uh->len = htons(udp_len); + + psph->nexthdr = IPPROTO_TCP; + psph->hdrlen = PSP_HDRLEN_NOOPT; + psph->crypt_offset = 0; + psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) | + FIELD_PREP(PSPHDR_VERFL_ONE, 1); + psph->spi = spi; + memset(&psph->iv, 0, sizeof(psph->iv)); +} + +/* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling + * them in. + */ +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport) +{ + u32 network_len = skb_network_header_len(skb); + u32 ethr_len = skb_mac_header_len(skb); + u32 bufflen = ethr_len + network_len; + + if (skb_cow_head(skb, PSP_ENCAP_HLEN)) + return false; + + skb_push(skb, PSP_ENCAP_HLEN); + skb->mac_header -= PSP_ENCAP_HLEN; + skb->network_header -= PSP_ENCAP_HLEN; + skb->transport_header -= PSP_ENCAP_HLEN; + memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen); + + if (skb->protocol == htons(ETH_P_IP)) { + ip_hdr(skb)->protocol = IPPROTO_UDP; + be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN); + ip_hdr(skb)->check = 0; + ip_hdr(skb)->check = + ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_UDP; + be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN); + } else { + return false; + } + + skb_set_inner_ipproto(skb, IPPROTO_TCP); + skb_set_inner_transport_header(skb, skb_transport_offset(skb) + + PSP_ENCAP_HLEN); + skb->encapsulation = 1; + psp_write_headers(net, skb, spi, ver, + skb->len - skb_transport_offset(skb), sport); + + return true; +} +EXPORT_SYMBOL(psp_dev_encapsulate); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); -- cgit v1.2.3 From 0eddb8023cee546eb05658ef3322234de8461f3b Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Tue, 16 Sep 2025 17:09:44 -0700 Subject: psp: provide decapsulation and receive helper for drivers Create psp_dev_rcv(), which drivers can call to psp decapsulate and attach a psp_skb_ext to an skb. psp_dev_rcv() only supports what the PSP architecture specification refers to as "transport mode" packets, where the L3 header is either IPv6 or IPv4. 
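As a rough sketch of the intended call site, a driver whose hardware has already authenticated and decrypted a PSP packet could feed it to the helper as below; the driver structure, descriptor field, and function names are hypothetical, only psp_dev_rcv() and the generic skb helpers are real:

    /* Hypothetical driver Rx path: hand a HW-decrypted PSP packet to the
     * core for decapsulation. psp_dev_rcv() expects skb->data at the mac
     * header and skb->mac_len set; desc->psp_gen is assumed to be the
     * device key generation reported by the NIC (0 opts out of tracking).
     */
    static void example_rx_psp(struct example_priv *priv, struct sk_buff *skb,
                               const struct example_rx_desc *desc)
    {
        if (psp_dev_rcv(skb, priv->psd->id, desc->psp_gen, true)) {
            kfree_skb(skb);
            return;
        }
        skb->decrypted = 1;
        napi_gro_receive(&priv->napi, skb);
    }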
Reviewed-by: Willem de Bruijn Signed-off-by: Raed Salem Signed-off-by: Rahul Rameshbabu Signed-off-by: Cosmin Ratiu Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-18-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/net/psp/functions.h | 1 + net/psp/psp_main.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 0a539e1b39f4..91ba06733321 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -19,6 +19,7 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, u8 ver, __be16 sport); +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); /* Kernel-facing API */ void psp_assoc_put(struct psp_assoc *pas); diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e026880fa1a2..b4b756f87382 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -223,6 +223,94 @@ bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, } EXPORT_SYMBOL(psp_dev_encapsulate); +/* Receive handler for PSP packets. + * + * Presently it accepts only already-authenticated packets and does not + * support optional fields, such as virtualization cookies. The caller should + * ensure that skb->data is pointing to the mac header, and that skb->mac_len + * is set. + */ +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) +{ + int l2_hlen = 0, l3_hlen, encap; + struct psp_skb_ext *pse; + struct psphdr *psph; + struct ethhdr *eth; + struct udphdr *uh; + __be16 proto; + bool is_udp; + + eth = (struct ethhdr *)skb->data; + proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen); + if (proto == htons(ETH_P_IP)) + l3_hlen = sizeof(struct iphdr); + else if (proto == htons(ETH_P_IPV6)) + l3_hlen = sizeof(struct ipv6hdr); + else + return -EINVAL; + + if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))) + return -EINVAL; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + is_udp = iph->protocol == IPPROTO_UDP; + l3_hlen = iph->ihl * 4; + if (l3_hlen != sizeof(struct iphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)) + return -EINVAL; + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + is_udp = ipv6h->nexthdr == IPPROTO_UDP; + } + + if (unlikely(!is_udp)) + return -EINVAL; + + uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen); + if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) + return -EINVAL; + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + pse->spi = psph->spi; + pse->dev_id = dev_id; + pse->generation = generation; + pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); + + encap = PSP_ENCAP_HLEN; + encap += strip_icv ? 
PSP_TRL_SIZE : 0; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + iph->protocol = psph->nexthdr; + iph->tot_len = htons(ntohs(iph->tot_len) - encap); + iph->check = 0; + iph->check = ip_fast_csum((u8 *)iph, iph->ihl); + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + ipv6h->nexthdr = psph->nexthdr; + ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); + } + + memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); + skb_pull(skb, PSP_ENCAP_HLEN); + + if (strip_icv) + pskb_trim(skb, skb->len - PSP_TRL_SIZE); + + return 0; +} +EXPORT_SYMBOL(psp_dev_rcv); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); -- cgit v1.2.3 From 0aeb54ac4cd5cf8f60131b4d9ec0b6dc9c27b20d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:28:13 -0700 Subject: tls: make sure to abort the stream if headers are bogus Normally we wait for the socket to buffer up the whole record before we service it. If the socket has a tiny buffer, however, we read out the data sooner, to prevent connection stalls. Make sure that we abort the connection when we find out late that the record is actually invalid. Retrying the parsing is fine in itself but since we copy some more data each time before we parse we can overflow the allocated skb space. Constructing a scenario in which we're under pressure without enough data in the socket to parse the length upfront is quite hard. syzbot figured out a way to do this by serving us the header in small OOB sends, and then filling in the recvbuf with a large normal send. Make sure that tls_rx_msg_size() aborts strp, if we reach an invalid record there's really no way to recover. Reported-by: Lee Jones Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reviewed-by: Sabrina Dubroca Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250917002814.1743558-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/tls/tls.h | 1 + net/tls/tls_strp.c | 14 +++++++++----- net/tls/tls_sw.c | 3 +-- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index 4e077068e6d9..e4c42731ce39 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -141,6 +141,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); +void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index d71643b494a1..98e12f0ff57e 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -13,7 +13,7 @@ static struct workqueue_struct *tls_strp_wq; -static void tls_strp_abort_strp(struct tls_strparser *strp, int err) +void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; @@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { + unsigned int nfrag = skb->len / PAGE_SIZE; size_t len, chunk; skb_frag_t *frag; int sz; - frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; + if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + frag = &skb_shinfo(skb)->frags[nfrag]; len = in_len; /* First make sure we got the header */ @@ -520,10 +526,8 @@ static int tls_strp_read_sock(struct 
tls_strparser *strp) tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, strp->anchor); - if (sz < 0) { - tls_strp_abort_strp(strp, sz); + if (sz < 0) return sz; - } strp->stm.full_len = sz; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bac65d0d4e3e..daac9fd4be7e 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2474,8 +2474,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) return data_len + TLS_HEADER_SIZE; read_failure: - tls_err_abort(strp->sk, ret); - + tls_strp_abort_strp(strp, ret); return ret; } -- cgit v1.2.3 From 672beab06656f2f1bda4708cda2b9af61c58a7ea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Sep 2025 11:35:46 +0000 Subject: psp: rename our psp_dev_destroy() psp_dev_destroy() was already used in drivers/crypto/ccp/psp-dev.c. Use psp_dev_free() instead, to avoid a link error when CRYPTO_DEV_SP_CCP=y Fixes: 00c94ca2b99e ("psp: base PSP device support") Closes: https://lore.kernel.org/netdev/CANn89i+ZdBDEV6TE=Nw5gn9ycTzWw4mZOpPuCswgwEsrgOyNnw@mail.gmail.com/ Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Paolo Abeni Reviewed-by: Daniel Zahka Link: https://patch.msgid.link/20250918113546.177946-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/psp/psp.h | 4 ++-- net/psp/psp_main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/psp/psp.h b/net/psp/psp.h index 0f34e1a23fdd..9f19137593a0 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -13,7 +13,7 @@ extern struct xarray psp_devs; extern struct mutex psp_devs_lock; -void psp_dev_destroy(struct psp_dev *psd); +void psp_dev_free(struct psp_dev *psd); int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); @@ -42,7 +42,7 @@ static inline bool psp_dev_tryget(struct psp_dev *psd) static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) - psp_dev_destroy(psd); + psp_dev_free(psd); } static inline bool psp_dev_is_registered(struct psp_dev *psd) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index b4b756f87382..0f8c50c8e943 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -99,7 +99,7 @@ psp_dev_create(struct net_device *netdev, } EXPORT_SYMBOL(psp_dev_create); -void psp_dev_destroy(struct psp_dev *psd) +void psp_dev_free(struct psp_dev *psd) { mutex_lock(&psp_devs_lock); xa_erase(&psp_devs, psd->id); @@ -122,7 +122,7 @@ void psp_dev_unregister(struct psp_dev *psd) psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); - /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + /* Wait until psp_dev_free() to call xa_erase() to prevent a * different psd from being added to the xarray with this id, while * there are still references to this psd being held. */ -- cgit v1.2.3 From 3efaede2e13b0c0372500a6acdfc5c31268a3ed4 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:08 -0700 Subject: net: ethtool: pass the num of RX rings directly to ethtool_copy_validate_indir Modify ethtool_copy_validate_indir() and callers to validate indirection table entries against the number of RX rings as an integer instead of accessing rx_rings->data. This will be useful in the future, given that struct ethtool_rxnfc might not exist for a native GRXRINGS call.
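To make the invariant concrete, here is a minimal sketch (hypothetical helper, not part of the patch) of the rule the validation enforces: every RSS indirection-table entry maps a hash bucket to an RX ring index, so each entry must be strictly below the ring count, now passed as a plain integer:

static int check_indir_entries(const u32 *indir, u32 size, int num_rx_rings)
{
	u32 i;

	/* an entry naming a ring that doesn't exist cannot be installed */
	for (i = 0; i < size; i++)
		if (indir[i] >= (u32)num_rx_rings)
			return -EINVAL;
	return 0;
}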
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-1-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 0b2a4d0573b3..15627afa4424 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1246,8 +1246,8 @@ err_out: } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, - struct ethtool_rxnfc *rx_rings, - u32 size) + int num_rx_rings, + u32 size) { int i; @@ -1256,7 +1256,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, /* Validate ring indices */ for (i = 0; i < size; i++) - if (indir[i] >= rx_rings->data) + if (indir[i] >= num_rx_rings) return -EINVAL; return 0; @@ -1366,7 +1366,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, - &rx_rings, + rx_rings.data, rxfh_dev.indir_size); if (ret) goto out; @@ -1587,7 +1587,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, - &rx_rings, + rx_rings.data, rxfh.indir_size); if (ret) goto out_free; -- cgit v1.2.3 From 06fad5a4aeb2622b7aeefd6ad60b6cf76c0b7c5b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:09 -0700 Subject: net: ethtool: add support for ETHTOOL_GRXRINGS ioctl This patch adds handling for the ETHTOOL_GRXRINGS ioctl command in the ethtool ioctl dispatcher. It introduces a new helper function ethtool_get_rxrings() that calls the driver's get_rxnfc() callback with appropriate parameters to retrieve the number of RX rings supported by the device. By explicitly handling ETHTOOL_GRXRINGS, userspace queries through ethtool can now obtain RX ring information in a structured manner. In this patch, ethtool_get_rxrings() is simply a copy of ethtool_get_rxnfc().
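For context, a minimal userspace sketch (not part of the patch; error handling trimmed, helper name illustrative) of how ETHTOOL_GRXRINGS is issued through the legacy ioctl path this dispatcher serves:

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

/* Ask the kernel how many RX rings a device exposes. */
static long query_rx_ring_count(const char *ifname)
{
	struct ethtool_rxnfc info = { .cmd = ETHTOOL_GRXRINGS };
	struct ifreq ifr = {};
	long ret;
	int fd;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&info;
	ret = ioctl(fd, SIOCETHTOOL, &ifr) ? -1 : (long)info.data;
	close(fd);
	return ret;	/* ring count, or -1 on failure */
}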
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-2-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 15627afa4424..4214ab33c3c8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1208,6 +1208,44 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return 0; } +static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, + u32 cmd, + void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size = sizeof(info); + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + void *rule_buf = NULL; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; + + if (info.cmd == ETHTOOL_GRXCLSRLALL) { + if (info.rule_cnt > 0) { + if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), + GFP_USER); + if (!rule_buf) + return -ENOMEM; + } + } + + ret = ops->get_rxnfc(dev, &info, rule_buf); + if (ret < 0) + goto err_out; + + ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); +err_out: + kfree(rule_buf); + + return ret; +} + static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -3377,6 +3415,8 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr); break; case ETHTOOL_GRXRINGS: + rc = ethtool_get_rxrings(dev, ethcmd, useraddr); + break; case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: -- cgit v1.2.3 From 87c76c2db002e747269fc5d91461786ce57976d7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:10 -0700 Subject: net: ethtool: remove the duplicated handling from ethtool_get_rxrings ethtool_get_rxrings() was a copy of ethtool_get_rxnfc(). Clean the code that will never be executed for GRXRINGS specifically. 
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-3-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 4214ab33c3c8..a0f3de76cea0 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1212,52 +1212,39 @@ static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, u32 cmd, void __user *useraddr) { - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc info; + size_t info_size; int ret; - void *rule_buf = NULL; if (!ops->get_rxnfc) return -EOPNOTSUPP; + info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; - if (info.cmd == ETHTOOL_GRXCLSRLALL) { - if (info.rule_cnt > 0) { - if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) - rule_buf = kcalloc(info.rule_cnt, sizeof(u32), - GFP_USER); - if (!rule_buf) - return -ENOMEM; - } - } - - ret = ops->get_rxnfc(dev, &info, rule_buf); + ret = ops->get_rxnfc(dev, &info, NULL); if (ret < 0) - goto err_out; - - ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); -err_out: - kfree(rule_buf); + return ret; - return ret; + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); } static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; + struct ethtool_rxnfc info; void *rule_buf = NULL; + size_t info_size; + int ret; if (!ops->get_rxnfc) return -EOPNOTSUPP; + info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; -- cgit v1.2.3 From 84eaf4359c36b0ba888f571a964138d22ba5914f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:11 -0700 Subject: net: ethtool: add get_rx_ring_count callback to optimize RX ring queries Add a new optional get_rx_ring_count callback in ethtool_ops to allow drivers to provide the number of RX rings directly without going through the full get_rxnfc flow classification interface. Create ethtool_get_rx_ring_count() to use .get_rx_ring_count if available, falling back to get_rxnfc() otherwise. It needs to be non-static, given it will be called by other ethtool functions later, such as those calling get_rxfh(). Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-4-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ net/ethtool/common.c | 20 ++++++++++++++++++++ net/ethtool/common.h | 2 ++ net/ethtool/ioctl.c | 8 +++----- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d7d757e72554..c869b7f8bce8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -968,6 +968,7 @@ struct kernel_ethtool_ts_info { * @reset: Reset (part of) the device, as specified by a bitmask of * flags from &enum ethtool_reset_flags. Returns a negative * error code or zero. + * @get_rx_ring_count: Return the number of RX rings * @get_rxfh_key_size: Get the size of the RX flow hash key. * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table.
@@ -1162,6 +1163,7 @@ struct ethtool_ops { int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); int (*reset)(struct net_device *, u32 *); + u32 (*get_rx_ring_count)(struct net_device *dev); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..10460ea3717c 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -577,6 +577,26 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } +int ethtool_get_rx_ring_count(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings = {}; + int ret; + + if (ops->get_rx_ring_count) + return ops->get_rx_ring_count(dev); + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret < 0) + return ret; + + return rx_rings.data; +} + static int ethtool_get_rxnfc_rule_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c4d084dde5bf..1609cf4e53eb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,6 +54,8 @@ void ethtool_ringparam_get_cfg(struct net_device *dev, struct kernel_ethtool_ringparam *kparam, struct netlink_ext_ack *extack); +int ethtool_get_rx_ring_count(struct net_device *dev); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a0f3de76cea0..8493ee200601 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1212,23 +1212,21 @@ static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, u32 cmd, void __user *useraddr) { - const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size; int ret; - if (!ops->get_rxnfc) - return -EOPNOTSUPP; - info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; - ret = ops->get_rxnfc(dev, &info, NULL); + ret = ethtool_get_rx_ring_count(dev); if (ret < 0) return ret; + info.data = ret; + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); } -- cgit v1.2.3 From d5544688d4217f370e9189e17b6536b469ee0f1d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:12 -0700 Subject: net: ethtool: update set_rxfh to use ethtool_get_rx_ring_count helper Modify ethtool_set_rxfh() to use the new ethtool_get_rx_ring_count() helper function for retrieving the number of RX rings instead of directly calling get_rxnfc with ETHTOOL_GRXRINGS. This way, we can leverage the new helper if it is available in ethtool_ops. 
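As a driver-side sketch (hypothetical driver; foo_priv and its field are illustrative), the new callback lets ring-count queries skip the rxnfc machinery entirely:

static u32 foo_get_rx_ring_count(struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);

	return priv->rx_ring_count;
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_rx_ring_count	= foo_get_rx_ring_count,
	/* get_rxnfc is no longer required just to report ETHTOOL_GRXRINGS */
};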
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-5-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 8493ee200601..d61e34751adc 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1531,14 +1531,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param rxfh_dev = {}; struct ethtool_rxfh_context *ctx = NULL; struct netlink_ext_ack *extack = NULL; - struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; bool create = false; + int num_rx_rings; u8 *rss_config; int ntf = 0; int ret; - if (!ops->get_rxnfc || !ops->set_rxfh) + if (!ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -1594,10 +1594,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (!rss_config) return -ENOMEM; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) + num_rx_rings = ethtool_get_rx_ring_count(dev); + if (num_rx_rings < 0) { + ret = num_rx_rings; goto out_free; + } /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). @@ -1610,7 +1611,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, - rx_rings.data, + num_rx_rings, rxfh.indir_size); if (ret) goto out_free; @@ -1622,7 +1623,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + indir[i] = + ethtool_rxfh_indir_default(i, num_rx_rings); } else { rxfh_dev.rss_delete = true; } -- cgit v1.2.3 From dce08107f1f305b0fbef115410034b1fb3b7e070 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:13 -0700 Subject: net: ethtool: update set_rxfh_indir to use ethtool_get_rx_ring_count helper Modify ethtool_set_rxfh_indir() to use the new ethtool_get_rx_ring_count() helper function for retrieving the number of RX rings instead of directly calling get_rxnfc with ETHTOOL_GRXRINGS. This way, we can leverage the new helper if it is available in ethtool_ops.
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-6-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index d61e34751adc..fa83ddade4f8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1350,13 +1350,12 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; - struct ethtool_rxnfc rx_rings; + int num_rx_rings; u32 user_size, i; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - if (!ops->get_rxfh_indir_size || !ops->set_rxfh || - !ops->get_rxnfc) + if (!ops->get_rxfh_indir_size || !ops->set_rxfh) return -EOPNOTSUPP; rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); @@ -1376,20 +1375,21 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, if (!rxfh_dev.indir) return -ENOMEM; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) + num_rx_rings = ethtool_get_rx_ring_count(dev); + if (num_rx_rings < 0) { + ret = num_rx_rings; goto out; + } if (user_size == 0) { u32 *indir = rxfh_dev.indir; for (i = 0; i < rxfh_dev.indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings); } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, - rx_rings.data, + num_rx_rings, rxfh_dev.indir_size); if (ret) goto out; -- cgit v1.2.3 From 8b7c4b612decb79b157611225faee68c384102a9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:14 -0700 Subject: net: ethtool: use the new helper in rss_set_prep_indir() Refactor rss_set_prep_indir() to utilize the new ethtool_get_rx_ring_count() helper for determining the number of RX rings, replacing the direct use of get_rxnfc with ETHTOOL_GRXRINGS. This ensures compatibility with both legacy and new ethtool_ops interfaces by transparently multiplexing between them. 
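For reference, the default table that both the ioctl and netlink paths fall back to is a plain round-robin spread over the rings; ethtool_rxfh_indir_default() in include/linux/ethtool.h reduces to a modulo, so the reset path boils down to this sketch:

/* bucket i is served by ring (i % num_rx_rings) */
static void fill_default_indir(u32 *indir, u32 indir_size, u32 num_rx_rings)
{
	u32 i;

	for (i = 0; i < indir_size; i++)
		indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings);
}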
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-7-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- net/ethtool/rss.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 202d95e8bf3e..4dced53be4b3 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -620,23 +620,22 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh, bool *reset, bool *mod) { - const struct ethtool_ops *ops = dev->ethtool_ops; struct netlink_ext_ack *extack = info->extack; struct nlattr **tb = info->attrs; - struct ethtool_rxnfc rx_rings; size_t alloc_size; + int num_rx_rings; u32 user_size; int i, err; if (!tb[ETHTOOL_A_RSS_INDIR]) return 0; - if (!data->indir_size || !ops->get_rxnfc) + if (!data->indir_size) return -EOPNOTSUPP; - rx_rings.cmd = ETHTOOL_GRXRINGS; - err = ops->get_rxnfc(dev, &rx_rings, NULL); - if (err) + err = ethtool_get_rx_ring_count(dev); + if (err < 0) return err; + num_rx_rings = err; if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) { NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]); @@ -665,7 +664,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size); for (i = 0; i < user_size; i++) { - if (rxfh->indir[i] < rx_rings.data) + if (rxfh->indir[i] < num_rx_rings) continue; NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], @@ -682,7 +681,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, } else { for (i = 0; i < data->indir_size; i++) rxfh->indir[i] = - ethtool_rxfh_indir_default(i, rx_rings.data); + ethtool_rxfh_indir_default(i, num_rx_rings); } *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); -- cgit v1.2.3 From 3191df0a4882c827cac29925e80ecb1775b904bd Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 18 Sep 2025 13:15:06 +0300 Subject: devlink rate: Remove unnecessary 'static' from a couple places devlink_rate_node_get_by_name() and devlink_rate_nodes_destroy() have a couple of unnecessary static variables for iterating over devlink rates. This could lead to races/corruption/unhappiness if two concurrent operations execute the same function. Remove 'static' from both. It's amazing this was missed for 4+ years. While at it, I confirmed there are no more examples of this mistake in net/ with 1, 2 or 3 levels of indentation. 
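A minimal sketch of the bug class being fixed (types are illustrative): a function-local static is a single object shared by every caller, so two threads walking the list concurrently advance the same cursor and can return or dereference an entry from the wrong iteration:

struct item { struct list_head list; int key; };

static struct item *lookup(struct list_head *head, int key)
{
	static struct item *it;	/* BUG: one cursor shared by all callers */

	list_for_each_entry(it, head, list)
		if (it->key == key)
			return it;
	return NULL;
}

Dropping 'static' gives each call its own stack-local cursor, which is all the iteration ever needed.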
Fixes: a8ecb93ef03d ("devlink: Introduce rate nodes") Signed-off-by: Cosmin Ratiu Signed-off-by: Jakub Kicinski --- net/devlink/rate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 110b3fa8a0b1..264fb82cba19 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { - static struct devlink_rate *devlink_rate; + struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && @@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); */ void devl_rate_nodes_destroy(struct devlink *devlink) { - static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; + struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); -- cgit v1.2.3 From e8fe3f07a357c39d429e02ca34f740692d88967a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 19 Aug 2025 18:10:13 +0200 Subject: 9p/trans_fd: p9_fd_request: kick rx thread if EPOLLIN p9_read_work() doesn't set Rworksched and doesn't do schedule_work(m->rq) if list_empty(&m->req_list). However, if the pipe is full, we need to read more data and this used to work prior to commit aaec5a95d59615 ("pipe_read: don't wake up the writer if the pipe is still full"). p9_read_work() does p9_fd_read() -> ... -> anon_pipe_read() which (before the commit above) triggered the unnecessary wakeup. This wakeup calls p9_pollwake() which kicks p9_poll_workfn() -> p9_poll_mux(), p9_poll_mux() will notice EPOLLIN and schedule_work(&m->rq). This no longer happens after the optimization above, change p9_fd_request() to use p9_poll_mux() instead of only checking for EPOLLOUT. 
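Conceptually (a simplified sketch, not the exact trans_fd source), p9_poll_mux() re-checks both directions and kicks the matching worker, which is why calling it here also schedules the read work when EPOLLIN is already pending:

static void poll_mux_sketch(struct p9_conn *m)
{
	__poll_t n = p9_fd_poll(m->client, NULL, NULL);

	if (n & EPOLLIN && !test_and_set_bit(Rworksched, &m->wsched))
		schedule_work(&m->rq);	/* read work: drain the (possibly full) pipe */
	if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
		schedule_work(&m->wq);	/* write work: flush queued requests */
}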
Reported-by: syzbot+d1b5dace43896bc386c3@syzkaller.appspotmail.com Tested-by: syzbot+d1b5dace43896bc386c3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68a2de8f.050a0220.e29e5.0097.GAE@google.com/ Link: https://lore.kernel.org/all/67dedd2f.050a0220.31a16b.003f.GAE@google.com/ Co-developed-by: K Prateek Nayak Signed-off-by: K Prateek Nayak Signed-off-by: Oleg Nesterov Tested-by: K Prateek Nayak Message-ID: <20250819161013.GB11345@redhat.com> Signed-off-by: Dominique Martinet --- net/9p/trans_fd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 8992d8bebbdd..a516745f732f 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -666,7 +666,6 @@ static void p9_poll_mux(struct p9_conn *m) static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) { - __poll_t n; int err; struct p9_trans_fd *ts = client->trans; struct p9_conn *m = &ts->conn; @@ -686,13 +685,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) list_add_tail(&req->req_list, &m->unsent_req_list); spin_unlock(&m->req_lock); - if (test_and_clear_bit(Wpending, &m->wsched)) - n = EPOLLOUT; - else - n = p9_fd_poll(m->client, NULL, NULL); - - if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) - schedule_work(&m->wq); + p9_poll_mux(m); return 0; } -- cgit v1.2.3 From 01b4a3061b1d4ded108e1a700b4414c00662954c Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:55 +0300 Subject: wifi: nl80211: Add more configuration options for NAN commands Current NAN APIs have only basic configuration for master preference and operating bands. Add and parse additional parameters which provide more control over NAN synchronization. The newly added attributes allow publishing additional NAN attributes and vendor elements in NAN beacons, controlling scan and discovery beacon periodicity, enabling/disabling DW notifications, etc. Signed-off-by: Andrei Otcheretianski tested: Miriam Rachel Korenblit Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.a4779492bf8e.I375feb919bd72358173766b9fe10010c40796b33@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 60 +++++++++ include/uapi/linux/nl80211.h | 110 +++++++++++++++- net/wireless/nl80211.c | 298 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 431 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4072a67c9cc9..e2f4ca500ea3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3912,6 +3912,38 @@ struct cfg80211_qos_map { struct cfg80211_dscp_range up[8]; }; +/** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. For 5 GHz band, the channel is either 44 or 149, according + * to the regulatory constraints. If chan pointer is NULL the entire band + * configuration entry is considered invalid and should not be used. + * @rssi_close: RSSI close threshold used for NAN state transition algorithm + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should + * be greater than -60 dBm. + * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm.
+ * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should be + * greater than -75 dBm and less than rssi_close. + * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0 + * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise + * 2^(n-1). + * @disable_scan: If true, the device will not scan this band for cluster + * merge. Disabling scan on 2.4 GHz band is not allowed. + */ +struct cfg80211_nan_band_config { + struct ieee80211_channel *chan; + s8 rssi_close; + s8 rssi_middle; + u8 awake_dw_interval; + bool disable_scan; +}; + /** * struct cfg80211_nan_conf - NAN configuration * @@ -3921,10 +3953,31 @@ struct cfg80211_qos_map { * @bands: operating bands, a bitmap of &enum nl80211_band values. * For instance, for NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address + * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. + * If NULL, the device will pick a random Cluster ID. + * @scan_period: period (in seconds) between NAN scans. + * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. + * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @band_cfgs: array of band specific configurations, indexed by + * &enum nl80211_band values. + * @extra_nan_attrs: pointer to additional NAN attributes. + * @extra_nan_attrs_len: length of the additional NAN attributes. + * @vendor_elems: pointer to vendor-specific elements. + * @vendor_elems_len: length of the vendor-specific elements. */ struct cfg80211_nan_conf { u8 master_pref; u8 bands; + const u8 *cluster_id; + u16 scan_period; + u16 scan_dwell_time; + u8 discovery_beacon_interval; + struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; + const u8 *extra_nan_attrs; + u16 extra_nan_attrs_len; + const u8 *vendor_elems; + u16 vendor_elems_len; }; /** @@ -3933,10 +3986,17 @@ struct cfg80211_nan_conf { * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands + * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration. + * When this flag is set, it indicates that some additional attribute(s) + * (other then master_pref and bands) have been changed. In this case, + * all the unchanged attributes will be properly configured to their + * previous values. The driver doesn't need to store any + * previous configuration besides master_pref and bands. */ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index aed0b4c5d5e8..20b8202a3d58 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1085,8 +1085,9 @@ * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is * omitted or set to 0, it means don't-care and the device will - * decide what to use. After this command NAN functions can be - * added. + * decide what to use. Additional cluster configuration may be + * optionally provided with %NL80211_ATTR_NAN_CONFIG. + * After this command NAN functions can be added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. 
* @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined @@ -1115,6 +1116,10 @@ * current configuration is not changed. If it is present but * set to zero, the configuration is changed to don't-care * (i.e. the device can decide what to do). + * Additional parameters may be provided with + * %NL80211_ATTR_NAN_CONFIG. User space should provide all previously + * configured nested attributes under %NL80211_ATTR_NAN_CONFIG, even if + * only a subset was changed. * @NL80211_CMD_NAN_MATCH: Notification sent when a match is reported. * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. @@ -2936,6 +2941,12 @@ enum nl80211_commands { * indicate that it wants strict checking on the BSS parameters to be * modified. * + * @NL80211_ATTR_NAN_CONFIG: Nested attribute for + * extended NAN cluster configuration. This is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. + * See &enum nl80211_nan_conf_attributes for details. + * This attribute is optional. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3498,6 +3509,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, + NL80211_ATTR_NAN_CONFIG, /* add attributes here, update the policy in nl80211.c */ @@ -7323,6 +7335,100 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * enum nl80211_nan_band_conf_attributes - NAN band configuration attributes + * @__NL80211_NAN_BAND_CONF_INVALID: Invalid. + * @NL80211_NAN_BAND_CONF_BAND: Band for which the configuration is + * being set. The value is according to &enum nl80211_band (u8). + * @NL80211_NAN_BAND_CONF_FREQ: Discovery frequency. This attribute shall not + * be present on 2.4 GHZ band. On 5 GHz band its presence is optional. + * The allowed values are 5220 (channel 44) or 5745 (channel 149). + * If not present, channel 149 is used if allowed, otherwise channel 44 + * will be selected. The value is in MHz (u16). + * @NL80211_NAN_BAND_CONF_RSSI_CLOSE: RSSI close threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not specified, default device value is used. The value should + * be greater than -60 dBm (s8). + * @NL80211_NAN_BAND_CONF_RSSI_MIDDLE: RSSI middle threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not present, default device value is used. The value should be + * greater than -75 dBm and less than %NL80211_NAN_BAND_CONF_RSSI_CLOSE + * (s8). + * @NL80211_NAN_BAND_CONF_WAKE_DW: Committed DW information (values 0-5). + * Value 0 means that the device will not wake up during the + * discovery window. Values 1-5 mean that the device will wake up + * during each 2^(n - 1) discovery window, where n is the value of + * this attribute. Setting this attribute to 0 is not allowed on + * 2.4 GHz band (u8). This is an optional parameter (default is 1). + * @NL80211_NAN_BAND_CONF_DISABLE_SCAN: Optional flag attribute to disable + * scanning (for cluster merge) on the band. If set, the device will not + * scan on this band anymore. Disabling scanning on 2.4 GHz band is not + * allowed. 
+ * @NUM_NL80211_NAN_BAND_CONF_ATTR: Internal. + * @NL80211_NAN_BAND_CONF_ATTR_MAX: Highest NAN band configuration attribute. + * + * These attributes are used to configure NAN band-specific parameters. Note, + * that both RSSI attributes should be configured (or both left unset). + */ +enum nl80211_nan_band_conf_attributes { + __NL80211_NAN_BAND_CONF_INVALID, + NL80211_NAN_BAND_CONF_BAND, + NL80211_NAN_BAND_CONF_FREQ, + NL80211_NAN_BAND_CONF_RSSI_CLOSE, + NL80211_NAN_BAND_CONF_RSSI_MIDDLE, + NL80211_NAN_BAND_CONF_WAKE_DW, + NL80211_NAN_BAND_CONF_DISABLE_SCAN, + + /* keep last */ + NUM_NL80211_NAN_BAND_CONF_ATTR, + NL80211_NAN_BAND_CONF_ATTR_MAX = NUM_NL80211_NAN_BAND_CONF_ATTR - 1, +}; + +/** + * enum nl80211_nan_conf_attributes - NAN configuration attributes + * @__NL80211_NAN_CONF_INVALID: Invalid attribute, used for validation. + * @NL80211_NAN_CONF_CLUSTER_ID: ID for the NAN cluster. This is a MAC + * address that can take values from 50-6F-9A-01-00-00 to + * 50-6F-9A-01-FF-FF. This attribute is optional. If not present, + * a random Cluster ID will be chosen. + * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be + * published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will + * be published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_BAND_CONFIGS: This is a nested array attribute, + * containing multiple entries for each supported band. Each band + * configuration consists of &enum nl80211_nan_band_conf_attributes. + * @NL80211_NAN_CONF_SCAN_PERIOD: Scan period in seconds. If not configured, + * device default is used. Zero value will disable scanning. + * This is u16 (optional). + * @NL80211_NAN_CONF_SCAN_DWELL_TIME: Scan dwell time in TUs per channel. + * Only non-zero values are valid. If not configured the device default + * value is used. This is u16 (optional) + * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval + * in TUs. Valid range is 50-200 TUs. If not configured the device default + * value is used. This is u8 (optional) + * @NUM_NL80211_NAN_CONF_ATTR: Internal. + * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. + * + * These attributes are used to configure NAN-specific parameters. + */ +enum nl80211_nan_conf_attributes { + __NL80211_NAN_CONF_INVALID, + NL80211_NAN_CONF_CLUSTER_ID, + NL80211_NAN_CONF_EXTRA_ATTRS, + NL80211_NAN_CONF_VENDOR_ELEMS, + NL80211_NAN_CONF_BAND_CONFIGS, + NL80211_NAN_CONF_SCAN_PERIOD, + NL80211_NAN_CONF_SCAN_DWELL_TIME, + NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + + /* keep last */ + NUM_NL80211_NAN_CONF_ATTR, + NL80211_NAN_CONF_ATTR_MAX = NUM_NL80211_NAN_CONF_ATTR - 1, +}; + /** * enum nl80211_external_auth_action - Action to perform with external * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. 
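To make the nesting of NL80211_ATTR_NAN_CONFIG concrete, here is a hypothetical libnl userspace sketch (values are arbitrary examples; error checks omitted) of attaching the extended configuration when issuing NL80211_CMD_START_NAN:

struct nlattr *cfg, *bands, *entry;

cfg = nla_nest_start(msg, NL80211_ATTR_NAN_CONFIG);
nla_put_u16(msg, NL80211_NAN_CONF_SCAN_PERIOD, 30);		/* seconds */
nla_put_u16(msg, NL80211_NAN_CONF_SCAN_DWELL_TIME, 100);	/* TUs, 50-512 */
nla_put_u8(msg, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, 100); /* TUs, 50-200 */

bands = nla_nest_start(msg, NL80211_NAN_CONF_BAND_CONFIGS);
entry = nla_nest_start(msg, 1);			/* first array element */
nla_put_u8(msg, NL80211_NAN_BAND_CONF_BAND, NL80211_BAND_5GHZ);
nla_put_u16(msg, NL80211_NAN_BAND_CONF_FREQ, 5745);	/* channel 149 */
nla_nest_end(msg, entry);
nla_nest_end(msg, bands);
nla_nest_end(msg, cfg);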
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b7bc7e5e81dd..04679acc8135 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -500,6 +520,35 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -769,6 +818,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -15398,6 +15448,211 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel *nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr 
**tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + 
return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15414,23 +15669,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15786,6 +16031,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15793,27 +16039,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - changed |= CFG80211_NAN_CONF_CHANGED_BANDS; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + if (err) + return err; if (!changed) return -EINVAL; -- cgit v1.2.3 From ba9b2ceaa2558a38a5da59fd654b641610a8568e Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:56 +0300 Subject: wifi: nl80211: Add NAN Discovery Window (DW) notification This notification will be 
used by the device to inform user space about an upcoming DW. When received, user space will be able to prepare multicast Service Discovery Frames (SDFs) to be transmitted during the next DW using %NL80211_CMD_FRAME command on the NAN management interface. The device/driver will take care to transmit the frames with the correct timing. This allows implementing a synchronized Discovery Engine (DE) in user space if the device doesn't support DE offload. Note that this notification can be sent before the actual DW starts as long as the driver/device handles the actual timing of the SDF transmission. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.0e1d15031bab.I5b1721e61b63910452b3c5cdcdc1e94cb094d4c9@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 16 ++++++++++++++++ 4 files changed, 89 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e2f4ca500ea3..0c1311d254be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3959,6 +3959,8 @@ struct cfg80211_nan_band_config { * @scan_period: period (in seconds) between NAN scans. * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @enable_dw_notification: flag to enable/disable discovery window + * notifications. * @band_cfgs: array of band specific configurations, indexed by * &enum nl80211_band values. * @extra_nan_attrs: pointer to additional NAN attributes. @@ -3973,6 +3975,7 @@ struct cfg80211_nan_conf { u16 scan_period; u16 scan_dwell_time; u8 discovery_beacon_interval; + bool enable_dw_notification; struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; const u8 *extra_nan_attrs; @@ -10062,6 +10065,15 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); */ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); +/** + * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW) + * @wdev: Pointer to the wireless device structure + * @chan: DW channel (6, 44 or 149) + * @gfp: Memory allocation flags + */ +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 20b8202a3d58..d674608e2635 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1349,6 +1349,15 @@ * control EPCS configuration. Used to notify userland on the current state * of EPCS. + * + * @NL80211_CMD_NAN_NEXT_DW_NOTIFICATION: This command is used to notify + * user space about the next NAN Discovery Window (DW). User space may use + * it to prepare frames to be sent in the next DW. + * %NL80211_ATTR_WIPHY_FREQ is used to indicate the frequency of the next + * DW. SDF transmission should be requested with %NL80211_CMD_FRAME and + * the device/driver shall take care of the actual transmission timing. + * This notification is only sent to the NAN interface owning socket + * (see %NL80211_ATTR_SOCKET_OWNER flag).
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1609,6 +1618,8 @@ enum nl80211_commands { NL80211_CMD_ASSOC_MLO_RECONF, NL80211_CMD_EPCS_CFG, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -7409,6 +7420,10 @@ enum nl80211_nan_band_conf_attributes { * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval * in TUs. Valid range is 50-200 TUs. If not configured the device default * value is used. This is u8 (optional) + * @NL80211_NAN_CONF_NOTIFY_DW: If set, the driver will notify userspace about + * the upcoming discovery window with + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION. + * This is a flag attribute. * @NUM_NL80211_NAN_CONF_ATTR: Internal. * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. * @@ -7423,6 +7438,7 @@ enum nl80211_nan_conf_attributes { NL80211_NAN_CONF_SCAN_PERIOD, NL80211_NAN_CONF_SCAN_DWELL_TIME, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + NL80211_NAN_CONF_NOTIFY_DW, /* keep last */ NUM_NL80211_NAN_CONF_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 04679acc8135..d64145746b65 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -547,6 +547,7 @@ nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, }; static const struct netlink_range_validation nl80211_punct_bitmap_range = { @@ -15627,6 +15628,11 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) conf->discovery_beacon_interval = nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + out: if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { @@ -21764,6 +21770,45 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9b6074155d59..ff47e9bffd4f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4166,6 +4166,22 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct 
wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 1ccfd8db34fb3b1852284668094d7207499c2415 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:57 +0300 Subject: wifi: cfg80211: Add cluster joined notification APIs The drivers should notify upper layers and user space when a NAN device joins a cluster. This is needed, for example, to set the correct addr3 in SDF frames. Add API to report cluster join event. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.ad27b7b6e4d9.I70b213a2a49f18d1ba2ad325e67e8eff51cc7a1f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 4 files changed, 82 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0c1311d254be..1b10bd31bdd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10074,6 +10074,20 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d674608e2635..c5a7658b7297 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1357,6 +1357,9 @@ * the device/driver shall take care of the actual transmission timing. * This notification is only sent to the NAN interface owning socket * (see %NL80211_ATTR_SOCKET_OWNER flag). + * @NL80211_CMD_NAN_CLUSTER_JOINED: This command is used to notify + * user space that the NAN new cluster has been joined. The cluster ID is + * indicated by %NL80211_ATTR_MAC. * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1619,6 +1622,7 @@ enum nl80211_commands { NL80211_CMD_EPCS_CFG, NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + NL80211_CMD_NAN_CLUSTER_JOINED, /* add new commands above here */ @@ -2957,6 +2961,9 @@ enum nl80211_commands { * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. * See &enum nl80211_nan_conf_attributes for details. * This attribute is optional. + * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new + * NAN cluster has been created. 
This is used with + * %NL80211_CMD_NAN_CLUSTER_JOINED * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3521,6 +3528,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, + NL80211_ATTR_NAN_NEW_CLUSTER, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d64145746b65..904a725a4f4a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21809,6 +21809,47 @@ void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ff47e9bffd4f..8a4c34112eb5 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4182,6 +4182,25 @@ TRACE_EVENT(cfg80211_next_nan_dw_notif, WDEV_PR_ARG, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From b9c3d426c8a5823b3a1e5078719750c6abb0d2c1 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:12:59 +0300 Subject: wifi: cfg80211: Advertise supported NAN capabilities Allow drivers to specify the supported NAN capabilities and support advertising the NAN capabilities to user space. 
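For illustration only (not part of this change), a driver built on top of this patch might advertise its NAN capabilities at wiphy registration time roughly as follows; the field and flag names come from this patch, while the chosen values and the helper name are hypothetical:

	/* Hypothetical driver sketch: fill in NAN capabilities before
	 * wiphy_register(). Values below are examples only.
	 */
	static void drv_setup_nan_capa(struct wiphy *wiphy)
	{
		wiphy->nan_capa.flags = WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC |
					WIPHY_NAN_FLAGS_USERSPACE_DE;
		/* VHT PHY mode (Wi-Fi Aware (TM) spec Table 81) */
		wiphy->nan_capa.op_mode = NAN_OP_MODE_PHY_MODE_VHT;
		/* 2 TX antennas (lower nibble), 2 RX antennas (upper nibble) */
		wiphy->nan_capa.n_antennas = 0x22;
		wiphy->nan_capa.max_channel_switch_time = 4; /* ms */
		/* NDPE attribute supported (Wi-Fi Aware (TM) spec Table 79) */
		wiphy->nan_capa.dev_capabilities = NAN_DEV_CAPA_NDPE_SUPPORTED;
	}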
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.2976966556f5.Ic6e43b10049573180c909dad806f279cfb31143e@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 17 +++++++++++++++++ include/net/cfg80211.h | 38 ++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d350263f23f3..2110345de8ef 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6065,4 +6065,21 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) _data + ieee80211_mle_common_size(_data),\ _len - ieee80211_mle_common_size(_data)) +/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ +#define NAN_OP_MODE_PHY_MODE_VHT 0x01 +#define NAN_OP_MODE_PHY_MODE_HE 0x10 +#define NAN_OP_MODE_PHY_MODE_MASK 0x11 +#define NAN_OP_MODE_80P80MHZ 0x02 +#define NAN_OP_MODE_160MHZ 0x04 +#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 + +/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification + * Table 79 + */ +#define NAN_DEV_CAPA_DFS_OWNER 0x01 +#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 +#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 +#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 +#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 + #endif /* LINUX_IEEE80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b10bd31bdd6..e30c1886c530 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5711,6 +5711,42 @@ struct wiphy_radio { u32 antenna_mask; }; +/** + * enum wiphy_nan_flags - NAN capabilities + * + * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable + * synchronization. + * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload. + */ +enum wiphy_nan_flags { + WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0), + WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1), +}; + +/** + * struct wiphy_nan_capa - NAN capabilities + * + * This structure describes the NAN capabilities of a wiphy. + * + * @flags: NAN capabilities flags, see &enum wiphy_nan_flags + * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification + * Table 81. + * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower + * nibble indicates the number of TX antennas and upper nibble indicates the + * number of RX antennas. Value 0 indicates the information is not + * available. + * @max_channel_switch_time: maximum channel switch time in milliseconds. + * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + */ +struct wiphy_nan_capa { + u32 flags; + u8 op_mode; + u8 n_antennas; + u16 max_channel_switch_time; + u8 dev_capabilities; +}; + #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff /** @@ -5884,6 +5920,7 @@ struct wiphy_radio { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). 
+ * @nan_capa: NAN capabilities * * @txq_limit: configuration of internal TX queue frame limit * @txq_memory_limit: configuration internal TX queue memory limit @@ -6065,6 +6102,7 @@ struct wiphy { u32 bss_select_support; u8 nan_supported_bands; + struct wiphy_nan_capa nan_capa; u32 txq_limit; u32 txq_memory_limit; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 904a725a4f4a..bcd18ae59e84 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2605,6 +2605,41 @@ fail: return -ENOBUFS; } +static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *nan_caps; + + nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES); + if (!nan_caps) + return -ENOBUFS; + + if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC && + nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC)) + goto fail; + + if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) && + nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE)) + goto fail; + + if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE, + wiphy->nan_capa.op_mode) || + nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS, + wiphy->nan_capa.n_antennas) || + nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + wiphy->nan_capa.max_channel_switch_time) || + nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES, + wiphy->nan_capa.dev_capabilities)) + goto fail; + + nla_nest_end(msg, nan_caps); + + return 0; + +fail: + nla_nest_cancel(msg, nan_caps); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -3257,6 +3292,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_radios(&rdev->wiphy, msg)) goto nla_put_failure; + state->split_start++; + break; + case 18: + if (nl80211_put_nan_capa(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From fc41f4a28ac4d462487903229494eeb266f68a40 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:00 +0300 Subject: wifi: cfg80211: Support Tx/Rx of action frame for NAN Add support for sending and receiving action frames over a NAN Device interface: - For Synchronized NAN operation, transmissions of NAN Service Discovery Frames (SDFs) and NAN Action Frames (NAFs) over a NAN Device interface do not require a channel parameter, as the frames can be transmitted based on the NAN Device schedule. - For Unsynchronized NAN Discovery (USD) operation, the SDFs and NAFs can be transmitted using NL80211_CMD_FRAME, where a specific channel and dwell time are configured. As Synchronized NAN operation and USD can be done concurrently, both modes need to be supported. Thus, allow sending NAN action frames when user space handles the NAN Discovery Engine (DE), both with and without providing a channel as a parameter. To support reception of NAN Action frames and Authentication frames (used for NAN pairing and verification), allow registering for management frame reception on a NAN Device interface when user space handles the NAN DE.
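The nl80211 checks that implement this (see the hunks below) boil down to the following predicate; the helper name is hypothetical, the calls are the ones used in the patch:

	/* Sketch: NAN management-frame registration/Tx is allowed when
	 * either secure NAN is supported, or user space runs the DE.
	 */
	static bool nan_mgmt_allowed(struct wireless_dev *wdev)
	{
		return wiphy_ext_feature_isset(wdev->wiphy,
					       NL80211_EXT_FEATURE_SECURE_NAN) ||
		       (wdev->wiphy->nan_capa.flags &
			WIPHY_NAN_FLAGS_USERSPACE_DE);
	}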
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.71da2b062929.I0166d51dcf14393f628cd5da366c21114f518618@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bcd18ae59e84..72f68a17c92b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13782,7 +13782,9 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) break; case NL80211_IFTYPE_NAN: if (!wiphy_ext_feature_isset(wdev->wiphy, - NL80211_EXT_FEATURE_SECURE_NAN)) + NL80211_EXT_FEATURE_SECURE_NAN) && + !(wdev->wiphy->nan_capa.flags & + WIPHY_NAN_FLAGS_USERSPACE_DE)) return -EOPNOTSUPP; break; default: @@ -13843,7 +13845,9 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) break; case NL80211_IFTYPE_NAN: if (!wiphy_ext_feature_isset(wdev->wiphy, - NL80211_EXT_FEATURE_SECURE_NAN)) + NL80211_EXT_FEATURE_SECURE_NAN) && + !(wdev->wiphy->nan_capa.flags & + WIPHY_NAN_FLAGS_USERSPACE_DE)) return -EOPNOTSUPP; break; default: -- cgit v1.2.3 From 1884e2594b084a6b1eb438e5eda586f284d80fee Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:01 +0300 Subject: wifi: cfg80211: Store the NAN cluster ID When the driver indicates that the device has joined a cluster, store the cluster ID. This is needed for data path operations, e.g., filtering received frames etc. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.63e9fef2a3aa.I6c858185c9e71f84bd2c5174d7ee45902b4391c3@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ net/wireless/nl80211.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e30c1886c530..26fd42e189ce 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6681,6 +6681,9 @@ struct wireless_dev { struct { struct cfg80211_chan_def chandef; } ocb; + struct { + u8 cluster_id[ETH_ALEN] __aligned(2); + } nan; } u; struct { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 72f68a17c92b..4e0d40865441 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21865,6 +21865,8 @@ void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + memcpy(wdev->u.nan.cluster_id, cluster_id, ETH_ALEN); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!msg) return; -- cgit v1.2.3 From fc41f4a28ac4d462487903229494eeb266f68a40 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:02 +0300 Subject: wifi: mac80211: Support Tx of action frame for NAN Add support for sending management frames over a NAN Device interface: - Declare support for the supported management frame types. - Since action frame transmissions over a NAN Device interface do not necessarily require a channel configuration, e.g., they can be transmitted during a DW, modify the Tx path to avoid accessing channel information for a NAN Device interface. - In addition, modify the relevant points in the Tx path logic to account for cases where a band is not specified in the Tx information.
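The band handling added below follows one pattern: frames sent on a NAN interface get the out-of-range sentinel NUM_NL80211_BANDS, and each consumer of info->band checks for it before indexing the band array. A condensed, hypothetical sketch of that guard:

	/* Hypothetical helper mirroring the checks added below: only look
	 * up a band that was actually assigned to the frame.
	 */
	static struct ieee80211_supported_band *
	tx_info_sband(struct ieee80211_hw *hw, struct ieee80211_tx_info *info)
	{
		if (info->band >= NUM_NL80211_BANDS)
			return NULL; /* e.g. NAN: no band was assigned */
		return hw->wiphy->bands[info->band];
	}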
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.23b160089228.I65a58af753bcbcfb5c4ad8ef372d546f889725ba@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/main.c | 5 +++++ net/mac80211/offchannel.c | 5 ++++- net/mac80211/rate.c | 11 ++++++++++- net/mac80211/tx.c | 12 ++++++++++-- 5 files changed, 33 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a45e4bee65d4..a5140ecf334b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3192,6 +3192,10 @@ ieee80211_get_tx_rate(const struct ieee80211_hw *hw, { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; + + if (c->band >= NUM_NL80211_BANDS) + return NULL; + return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 437f1363c982..27b3ec5deabe 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -746,6 +746,11 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | BIT(IEEE80211_STYPE_AUTH >> 4), }, + [NL80211_IFTYPE_NAN] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), + }, }; static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 13df6321634d..ae82533e3c02 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,7 +8,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2009 Johannes Berg - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include #include @@ -897,6 +897,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, need_offchan = true; break; case NL80211_IFTYPE_NAN: + break; default: return -EOPNOTSUPP; } @@ -910,6 +911,8 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; + } else if (sdata->vif.type == NL80211_IFTYPE_NAN) { + /* Frames can be sent during NAN schedule */ } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3cb2ad6d0b28..e441f8541603 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -4,7 +4,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. 
* Copyright (c) 2006 Jiri Benc * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include @@ -98,6 +98,9 @@ void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + if (st->info->band >= NUM_NL80211_BANDS) + return; + sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); @@ -419,6 +422,9 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (!sband) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -898,6 +904,9 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, return; sdata = vif_to_sdata(vif); + if (info->band >= NUM_NL80211_BANDS) + return; + sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a27e2af5d569..ba51198be94a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -59,6 +59,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; + if (info->band >= NUM_NL80211_BANDS) + return 0; + sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; @@ -683,7 +686,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) memset(&txrc, 0, sizeof(txrc)); - sband = tx->local->hw.wiphy->bands[info->band]; + if (info->band < NUM_NL80211_BANDS) + sband = tx->local->hw.wiphy->bands[info->band]; + else + return TX_CONTINUE; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); @@ -6288,7 +6294,9 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, enum nl80211_band band; rcu_read_lock(); - if (!ieee80211_vif_is_mld(&sdata->vif)) { + if (sdata->vif.type == NL80211_IFTYPE_NAN) { + band = NUM_NL80211_BANDS; + } else if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); -- cgit v1.2.3 From 488d2e0bba65257cd0e723c413f02a9caf95b27c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:03 +0300 Subject: wifi: mac80211: Accept management frames on NAN interface Accept Public Action frames and Authentication frames on NAN Device interface to support flows that require these frames: - SDFs: For user space Discovery Engine (DE) implementation. - NAFs: For user space NAN Data Path (NDP) establishment. - Authentication frames: For NAN Pairing and Verification. Accept only frames from devices that are part of the NAN cluster. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.46528d69e881.Ifccd87fb2a49a3af05238f74f52fa6da8de28811@changeid Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4d4ff4d4917a..feb81ffa4f8c 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4502,8 +4502,16 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) (ieee80211_is_auth(hdr->frame_control) && ether_addr_equal(sdata->vif.addr, hdr->addr1)); case NL80211_IFTYPE_NAN: - /* Currently no frames on NAN interface are allowed */ - return false; + /* Accept only frames that are addressed to the NAN cluster + * (based on the Cluster ID). 
From these frames, accept only + * action frames or authentication frames that are addressed to + * the local NAN interface. + */ + return memcmp(sdata->wdev.u.nan.cluster_id, + hdr->addr3, ETH_ALEN) == 0 && + (ieee80211_is_public_action(hdr, skb->len) || + (ieee80211_is_auth(hdr->frame_control) && + ether_addr_equal(sdata->vif.addr, hdr->addr1))); default: break; } -- cgit v1.2.3 From 8f79d2f13dd3b0af00a5303d4ff913767dd7684e Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:04 +0300 Subject: wifi: mac80211: Track NAN interface start/stop When NAN is started, mark the device as non-idle and set LED triggering, similar to scan and ROC. Set the device back to idle once NAN is stopped. Signed-off-by: Ilan Peer Reviewed-by: Andrei Otcheretianski Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.2711d62fce22.I9b9f826490e50967a66788d713b0eba985879873@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 20 +++++++++++++++++--- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/iface.c | 9 +++++++++ 3 files changed, 28 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b26f61f13605..252c0804de2f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -320,6 +320,9 @@ static int ieee80211_start_nan(struct wiphy *wiphy, lockdep_assert_wiphy(sdata->local->hw.wiphy); + if (sdata->u.nan.started) + return -EALREADY; + ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; @@ -329,12 +332,18 @@ static int ieee80211_start_nan(struct wiphy *wiphy, return ret; ret = drv_start_nan(sdata->local, sdata, conf); - if (ret) + if (ret) { ieee80211_sdata_stop(sdata); + return ret; + } - sdata->u.nan.conf = *conf; + sdata->u.nan.started = true; + ieee80211_recalc_idle(sdata->local); - return ret; + sdata->u.nan.conf.master_pref = conf->master_pref; + sdata->u.nan.conf.bands = conf->bands; + + return 0; } static void ieee80211_stop_nan(struct wiphy *wiphy, @@ -342,8 +351,13 @@ { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + if (!sdata->u.nan.started) + return; + drv_stop_nan(sdata->local, sdata); + sdata->u.nan.started = false; ieee80211_sdata_stop(sdata); + ieee80211_recalc_idle(sdata->local); } static int ieee80211_nan_change_conf(struct wiphy *wiphy, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8a666faeb1ec..48e1ba919fba 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -985,11 +985,13 @@ struct ieee80211_if_mntr { * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration + * @started: true iff NAN is started * @func_lock: lock for @func_inst_ids * @function_inst_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; + bool started; /* protects function_inst_ids */ spinlock_t func_lock; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 07ba68f7cd81..4a9175d9f51c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -107,6 +107,7 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, { bool working, scanning, active; unsigned int led_trig_start = 0, led_trig_stop = 0; + struct ieee80211_sub_if_data *iter; lockdep_assert_wiphy(local->hw.wiphy); @@ -117,6 +118,14 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, working = !local->ops->remain_on_channel && !list_empty(&local->roc_list); + 
list_for_each_entry(iter, &local->interfaces, list) { + if (iter->vif.type == NL80211_IFTYPE_NAN && + iter->u.nan.started) { + working = true; + break; + } + } + scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); -- cgit v1.2.3 From c7b5355b37a59c927b2374e9f783acd004d00960 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:05 +0300 Subject: wifi: mac80211: Get the correct interface for non-netdev skb status The function ieee80211_sdata_from_skb() always returned the P2P Device interface in case the skb was not associated with a netdev and didn't consider the possibility that a NAN Device interface is also enabled. To support configurations where both P2P Device and a NAN Device interface are active, extend the function to match the correct interface based on address 2 in the 802.11 MAC header. Since the 'p2p_sdata' field in struct ieee80211_local is no longer needed, remove it. Signed-off-by: Ilan Peer Reviewed-by: Andrei Otcheretianski Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.5252d2579a49.Id4576531c6b2ad83c9498b708dc0ade6b0214fa8@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 -- net/mac80211/iface.c | 16 +--------------- net/mac80211/status.c | 21 +++++++++++++++++++-- 3 files changed, 20 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 48e1ba919fba..242cb109b232 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1676,8 +1676,6 @@ struct ieee80211_local { struct idr ack_status_frames; spinlock_t ack_status_lock; - struct ieee80211_sub_if_data __rcu *p2p_sdata; - /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct ieee80211_chan_req monitor_chanreq; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 4a9175d9f51c..a7873832d4fa 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -620,10 +620,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do spin_unlock_bh(&sdata->u.nan.func_lock); break; - case NL80211_IFTYPE_P2P_DEVICE: - /* relies on synchronize_rcu() below */ - RCU_INIT_POINTER(local->p2p_sdata, NULL); - fallthrough; default: wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work); /* @@ -1414,6 +1410,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_recalc_idle(local); netif_carrier_on(dev); + list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); break; default: if (coming_up) { @@ -1477,17 +1474,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type != NL80211_IFTYPE_STATION); } - switch (sdata->vif.type) { - case NL80211_IFTYPE_P2P_DEVICE: - rcu_assign_pointer(local->p2p_sdata, sdata); - break; - case NL80211_IFTYPE_MONITOR: - list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); - break; - default: - break; - } - /* * set_multicast_list will be invoked by the networking core * which will check whether any increments here were done in diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a362254b310c..4b38aa0e902a 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2008-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation */ #include @@ -572,6 +572,7 @@ static struct
ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (void *)skb->data; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -585,7 +586,23 @@ ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) return NULL; } - return rcu_dereference(local->p2p_sdata); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_P2P_DEVICE: + break; + case NL80211_IFTYPE_NAN: + if (sdata->u.nan.started) + break; + fallthrough; + default: + continue; + } + + if (ether_addr_equal(sdata->vif.addr, hdr->addr2)) + return sdata; + } + + return NULL; } static void ieee80211_report_ack_skb(struct ieee80211_local *local, -- cgit v1.2.3 From 04f17cfea2442ef2ed01da7ba1f686a58a50048e Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:06 +0300 Subject: wifi: mac80211: Export an API to check if NAN is started Export it so that drivers can check whether a NAN Device interface is started or not. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.c69652f77eb6.Ie4f3d197e0706e742e3d97614fadc11b22adfbc6@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/util.c | 8 ++++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a5140ecf334b..a55085cf4ec4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7838,4 +7838,10 @@ int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, int n_vifs, enum ieee80211_chanctx_switch_mode mode); +/** + * ieee80211_vif_nan_started - Return whether a NAN vif is started + * @vif: the vif + * Return: %true iff the vif is a NAN interface and NAN is started + */ +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif); #endif /* MAC80211_H */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 9eb35e3b9e52..123842b841f2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4512,3 +4512,11 @@ void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) sizeof(tpe->psd_reg_client[i].power)); } } + +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; +} +EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); -- cgit v1.2.3 From 1d04fad3a495062a33940278536c15a29d0f0dbb Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:13:07 +0300 Subject: wifi: mac80211: Extend support for changing NAN configuration As 'struct cfg80211_nan_conf' was updated, update the relevant logic to accommodate these changes.
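For context, a driver's NAN change-configuration handler consumes the same change mask that mac80211 applies below; a minimal hypothetical sketch (program_*() are assumed driver internals, and the handler signature is an assumption for illustration):

	/* Hypothetical driver handler: react only to fields flagged as
	 * changed, using the CFG80211_NAN_CONF_CHANGED_* mask bits.
	 */
	static int drv_nan_change_conf_sketch(struct ieee80211_hw *hw,
					      struct ieee80211_vif *vif,
					      struct cfg80211_nan_conf *conf,
					      u32 changes)
	{
		if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
			program_master_pref(hw, vif, conf->master_pref);
		if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
			program_bands(hw, vif, conf->bands);
		return 0;
	}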
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.92b530ddaedf.I2b6d6f6074e25487303fde573ce764a64f87bdcd@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 136 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 113 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 252c0804de2f..da15ccfcf4a2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -311,6 +311,96 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy, ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } +static void ieee80211_nan_conf_free(struct cfg80211_nan_conf *conf) +{ + kfree(conf->cluster_id); + kfree(conf->extra_nan_attrs); + kfree(conf->vendor_elems); + memset(conf, 0, sizeof(*conf)); +} + +static void ieee80211_stop_nan(struct wiphy *wiphy, + struct wireless_dev *wdev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + if (!sdata->u.nan.started) + return; + + drv_stop_nan(sdata->local, sdata); + sdata->u.nan.started = false; + + ieee80211_nan_conf_free(&sdata->u.nan.conf); + + ieee80211_sdata_stop(sdata); + ieee80211_recalc_idle(sdata->local); +} + +static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst, + struct cfg80211_nan_conf *src, + u32 changes) +{ + if (changes & CFG80211_NAN_CONF_CHANGED_PREF) + dst->master_pref = src->master_pref; + + if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) + dst->bands = src->bands; + + if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) { + dst->scan_period = src->scan_period; + dst->scan_dwell_time = src->scan_dwell_time; + dst->discovery_beacon_interval = + src->discovery_beacon_interval; + dst->enable_dw_notification = src->enable_dw_notification; + memcpy(&dst->band_cfgs, &src->band_cfgs, + sizeof(dst->band_cfgs)); + + kfree(dst->cluster_id); + dst->cluster_id = NULL; + + kfree(dst->extra_nan_attrs); + dst->extra_nan_attrs = NULL; + dst->extra_nan_attrs_len = 0; + + kfree(dst->vendor_elems); + dst->vendor_elems = NULL; + dst->vendor_elems_len = 0; + + if (src->cluster_id) { + dst->cluster_id = kmemdup(src->cluster_id, ETH_ALEN, + GFP_KERNEL); + if (!dst->cluster_id) + goto no_mem; + } + + if (src->extra_nan_attrs && src->extra_nan_attrs_len) { + dst->extra_nan_attrs = kmemdup(src->extra_nan_attrs, + src->extra_nan_attrs_len, + GFP_KERNEL); + if (!dst->extra_nan_attrs) + goto no_mem; + + dst->extra_nan_attrs_len = src->extra_nan_attrs_len; + } + + if (src->vendor_elems && src->vendor_elems_len) { + dst->vendor_elems = kmemdup(src->vendor_elems, + src->vendor_elems_len, + GFP_KERNEL); + if (!dst->vendor_elems) + goto no_mem; + + dst->vendor_elems_len = src->vendor_elems_len; + } + } + + return 0; + +no_mem: + ieee80211_nan_conf_free(dst); + return -ENOMEM; +} + static int ieee80211_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) @@ -340,33 +430,22 @@ static int ieee80211_start_nan(struct wiphy *wiphy, sdata->u.nan.started = true; ieee80211_recalc_idle(sdata->local); - sdata->u.nan.conf.master_pref = conf->master_pref; - sdata->u.nan.conf.bands = conf->bands; + ret = ieee80211_nan_conf_copy(&sdata->u.nan.conf, conf, 0xFFFFFFFF); + if (ret) { + ieee80211_stop_nan(wiphy, wdev); + return ret; + } return 0; } -static void ieee80211_stop_nan(struct wiphy *wiphy, - struct wireless_dev *wdev) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - - if (!sdata->u.nan.started) - return; - - drv_stop_nan(sdata->local, sdata); - 
sdata->u.nan.started = false; - ieee80211_sdata_stop(sdata); - ieee80211_recalc_idle(sdata->local); -} - static int ieee80211_nan_change_conf(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct cfg80211_nan_conf new_conf; + struct cfg80211_nan_conf new_conf = {}; int ret = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN) @@ -375,17 +454,28 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - new_conf = sdata->u.nan.conf; + if (!changes) + return 0; - if (changes & CFG80211_NAN_CONF_CHANGED_PREF) - new_conf.master_pref = conf->master_pref; + /* First make a full copy of the previous configuration and then apply + * the changes. This might be a little wasteful, but it is simpler. + */ + ret = ieee80211_nan_conf_copy(&new_conf, &sdata->u.nan.conf, + 0xFFFFFFFF); + if (ret < 0) + return ret; - if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) - new_conf.bands = conf->bands; + ret = ieee80211_nan_conf_copy(&new_conf, conf, changes); + if (ret < 0) + return ret; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); - if (!ret) + if (ret) { + ieee80211_nan_conf_free(&new_conf); + } else { + ieee80211_nan_conf_free(&sdata->u.nan.conf); sdata->u.nan.conf = new_conf; + } return ret; } -- cgit v1.2.3 From 872e397d62a67843ea09cf6641819cb1a7e5ee98 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Tue, 16 Sep 2025 12:47:31 +0800 Subject: wifi: mac80211: Remove redundant rcu_read_lock/unlock() in spin_lock Since commit a8bb74acd8efe ("rcu: Consolidate RCU-sched update-side function definitions") there is no difference between rcu_read_lock(), rcu_read_lock_bh() and rcu_read_lock_sched() in terms of RCU read section and the relevant grace period. That means that spin_lock(), which implies rcu_read_lock_sched(), also implies rcu_read_lock(). There is no need to explicitly start an RCU read section if one has already been started implicitly by spin_lock(). Simplify the code and remove the inner rcu_read_lock() invocation.
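To make the reasoning concrete, a minimal sketch of the pattern being simplified, assuming the consolidated RCU behaviour described above:

	spin_lock_bh(&local->fq.lock);
	/* rcu_read_lock() here would be redundant: taking the spinlock
	 * already implies an RCU read-side critical section, so any
	 * RCU-protected state may be dereferenced until the unlock.
	 */
	/* ... access RCU-protected txq state ... */
	spin_unlock_bh(&local->fq.lock);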
Cc: Johannes Berg Signed-off-by: pengdonglin Signed-off-by: pengdonglin Link: https://patch.msgid.link/20250916044735.2316171-11-dolinux.peng@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 -- net/mac80211/debugfs.c | 2 -- net/mac80211/debugfs_netdev.c | 2 -- net/mac80211/debugfs_sta.c | 2 -- net/mac80211/sta_info.c | 2 -- 5 files changed, 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index da15ccfcf4a2..d9aca1c3c097 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -4935,7 +4935,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, int ret = 0; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -4961,7 +4960,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, } out: - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e8b78ec682da..82099f4cedbe 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -82,7 +82,6 @@ static ssize_t aqm_read(struct file *file, int len = 0; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); len = scnprintf(buf, sizeof(buf), "access name value\n" @@ -105,7 +104,6 @@ static ssize_t aqm_read(struct file *file, fq->limit, fq->quantum); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return simple_read_from_buffer(user_buf, count, ppos, diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 1dac78271045..30a5a978a678 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -625,7 +625,6 @@ static ssize_t ieee80211_if_fmt_aqm( txqi = to_txq_info(sdata->vif.txq); spin_lock_bh(&local->fq.lock); - rcu_read_lock(); len = scnprintf(buf, buflen, @@ -642,7 +641,6 @@ static ssize_t ieee80211_if_fmt_aqm( txqi->tin.tx_bytes, txqi->tin.tx_packets); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return len; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 49061bd4151b..ef75255d47d5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -148,7 +148,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, return -ENOMEM; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); p += scnprintf(p, bufsz + buf - p, @@ -178,7 +177,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : ""); } - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 1bd75e0375a0..f4d3b67fda06 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2637,13 +2637,11 @@ static void sta_set_tidstats(struct sta_info *sta, if (link_id < 0 && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); - rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } -- cgit v1.2.3 From ccdc96fa0ed888e89e617fffdd5c11915568d7a0 Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Thu, 18 Sep 2025 09:38:46 +0530 Subject: wifi: mac80211: remove tx_handlers_drop debugfs stats Commit 906a5a8c7152 ("wifi: mac80211: add tx_handlers_drop statistics to ethtool") added a tx_handlers_drop counter to ethtool stats. During review [1], Johannes noted that the existing debugfs counter is now redundant. 
Remove the debugfs stat to avoid duplication and streamline statistics reporting. Link: https://lore.kernel.org/linux-wireless/ce5f2bd899caa2de32f36ce554d9cada073979c0.camel@sipsolutions.net/ # [1] Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250918040846.4032734-1-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 1 - net/mac80211/ieee80211_i.h | 1 - net/mac80211/tx.c | 2 -- 3 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 82099f4cedbe..d02f07368c51 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -715,7 +715,6 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); - DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 242cb109b232..414058bced1a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1610,7 +1610,6 @@ struct ieee80211_local { u32 dot11TransmittedFrameCount; /* TX/RX handler statistics */ - unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ba51198be94a..e7b141c55f7a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1821,7 +1821,6 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { tx->sdata->tx_handlers_drop++; - I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else @@ -1866,7 +1865,6 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { tx->sdata->tx_handlers_drop++; - I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else -- cgit v1.2.3 From d0688dc2b172d19e20fdb8be8c37930da12aaf88 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:11 +1000 Subject: wifi: cfg80211: correctly implement and validate S1G chandef Currently, the S1G channelisation implementation differs from that of VHT, which is the PHY that S1G is based on. The major difference is that the S1G clock rate is 1/10th that of VHT; however, the way their channelisation is represented within cfg80211 and mac80211 vastly differs. To rectify this, remove the use of the IEEE80211_CHAN_1/2/4.. flags that were previously used to indicate the control channel width; instead it is implied that the control channels are 1MHz in the case of S1G. Additionally, introduce the inverse - IEEE80211_CHAN_NO_4/8/16MHZ - which implies the control channel may not be used for a certain bandwidth. With these new flags, we can perform regulatory and chandef validation just as we would for VHT. To deal with the notion that S1G PHYs may contain a 2MHz primary channel, introduce a new variable, s1g_primary_2mhz, which indicates whether we are operating on a 2MHz primary channel. In this case, chandef::chan points to the 1MHz primary channel designated by the primary channel location. Alongside this, introduce some new helper routines that can extract the sibling 1MHz channel.
The sibling is the alternate 1MHz primary subchannel within the 2MHz primary channel, i.e. the one not pointed to by chandef::chan. Furthermore, due to unique restrictions imposed on S1G PHYs, introduce a new flag, IEEE80211_CHAN_S1G_NO_PRIMARY, which states that the 1MHz channel cannot be used as a primary channel. This is assumed to be set by vendors as it is hardware and regdom specific. When we validate a 2MHz primary channel, we need to ensure that both 1MHz subchannels do not contain this flag. If one or both of the 1MHz subchannels contain this flag then the 2MHz primary is not permitted for use as a primary channel. Properly integrate S1G channel validation such that it is implemented in line with other PHY types such as VHT. Additionally, implement a new S1G-specific regulatory flag to allow cfg80211 to understand specific vendor requirements for S1G PHYs. Signed-off-by: Arien Judge Signed-off-by: Andrew Pope Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-2-lachlan.hodges@morsemicro.com [remove redundant NL80211_ATTR_S1G_PRIMARY_2MHZ check] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++ net/wireless/chan.c | 103 +++++++++++++++++++++++++++++-------------- net/wireless/nl80211.c | 37 +++++++++------- net/wireless/reg.c | 76 ++++++++++--------------------- 5 files changed, 225 insertions(+), 101 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 26fd42e189ce..2d612c760dd1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -129,6 +129,13 @@ struct wiphy; * with very low power (VLP), even if otherwise set to NO_IR. * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -158,6 +165,10 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -821,6 +832,9 @@ struct key_params { * @punctured: mask of the punctured 20 MHz subchannels, with * bits turned on being disabled (punctured); numbered * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel.
*/ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -830,6 +844,7 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; u16 freq1_offset; u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -990,6 +1005,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) return chandef->edmg.channels || chandef->edmg.bw_config; } +/** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + /** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition @@ -10179,4 +10206,72 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, void *data); #endif +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. 
+ */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 423e258cdbd2..8134f10e4e6c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2969,6 +2969,10 @@ enum nl80211_commands { * capabilities supported by the driver. See &enum nl80211_nan_capabilities * for details. * + * @NL80211_ATTR_S1G_PRIMARY_2MHZ: flag attribute indicating that the S1G + * primary channel is 2 MHz wide, and the control channel designates + * the 1 MHz primary subchannel within that 2 MHz primary. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3535,6 +3539,8 @@ enum nl80211_attrs { NL80211_ATTR_NAN_NEW_CLUSTER, NL80211_ATTR_NAN_CAPABILITIES, + NL80211_ATTR_S1G_PRIMARY_2MHZ, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4432,6 +4438,12 @@ enum nl80211_wmm_rule { * very low power (VLP) AP, despite being NO_IR. * @NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY: This channel can be active in * 20 MHz bandwidth, despite being NO_IR. + * @NL80211_FREQUENCY_ATTR_NO_4MHZ: 4 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_8MHZ: 8 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this + * channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4477,6 +4489,9 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_CAN_MONITOR, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY, + NL80211_FREQUENCY_ATTR_NO_4MHZ, + NL80211_FREQUENCY_ATTR_NO_8MHZ, + NL80211_FREQUENCY_ATTR_NO_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = 
cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case 
NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e0d40865441..de34a1d14073 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -931,6 +931,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1319,6 +1320,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -3541,6 +3551,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3584,27 +3595,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -10455,8 +10459,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10478,6 +10483,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct 
ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. - */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } -- cgit v1.2.3 From 31e7681da78d7e8d2d83185c0e640012a018f229 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:12 +1000 Subject: wifi: mac80211: correctly initialise S1G chandef for STA When moving to the APs channel, ensure we correctly initialise the chandef and perform the required validation. Additionally, if the AP is beaconing on a 2MHz primary, calculate the 2MHz primary center frequency by extracting the sibling 1MHz primary and averaging the frequencies to find the 2MHz primary center frequency. 
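As a minimal standalone sketch of that averaging (the frequencies are hypothetical; the kernel code works on struct ieee80211_channel via ieee80211_channel_to_khz()):

/* s1g_2mhz_center_demo.c - standalone sketch of the averaging described
 * above; the frequencies are hypothetical, chosen only to illustrate
 * the arithmetic. */
#include <stdio.h>

/* Sibling 1MHz primaries sit 1000 kHz apart, so their midpoint is the
 * center frequency of the 2MHz primary that contains them both. */
static unsigned int pri_2mhz_center_khz(unsigned int pri_khz,
                                        unsigned int sibling_khz)
{
        return (pri_khz + sibling_khz) / 2;
}

int main(void)
{
        unsigned int pri = 902500;      /* hypothetical 1MHz primary */
        unsigned int sib = 903500;      /* its 1MHz sibling */

        /* prints 903000, i.e. 500 kHz from either sibling */
        printf("2MHz primary center: %u kHz\n",
               pri_2mhz_center_khz(pri, sib));
        return 0;
}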
Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 +++++++++++++++- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/main.c | 6 ++++-- net/mac80211/mlme.c | 53 ++++++++++++++++++++++++++++++++++++++++------ net/mac80211/scan.c | 13 ++++++------ net/mac80211/util.c | 39 ++++++++++++++++++++++++++++------ 6 files changed, 109 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2110345de8ef..ddff9102f633 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1182,6 +1182,18 @@ enum ieee80211_s1g_chanwidth { IEEE80211_S1G_CHANWIDTH_16MHZ = 15, }; +/** + * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths + * described in IEEE80211-2024 Table 10-39. + * + * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel + * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel + */ +enum ieee80211_s1g_pri_chanwidth { + IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, + IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, +}; + #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -3170,8 +3182,12 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) -#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) +#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) + +#define S1G_2M_PRIMARY_LOCATION_LOWER 0 +#define S1G_2M_PRIMARY_LOCATION_UPPER 1 /* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ #define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 414058bced1a..73fd86ec1bce 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2710,7 +2710,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 27b3ec5deabe..eefa6f7e899b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1249,11 +1249,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!dflt_chandef.chan) { /* * Assign the first enabled channel to dflt_chandef - * from the list of channels + * from the list of channels. For S1G interfaces + * ensure it can be used as a primary. 
*/ for (i = 0; i < sband->n_channels; i++) if (!(sband->channels[i].flags & - IEEE80211_CHAN_DISABLED)) + (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_S1G_NO_PRIMARY))) break; /* if none found then use the first anyway */ if (i == sband->n_channels) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0e12309accbe..3b5827ea438e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -180,10 +180,11 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { - if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { - sdata_info(sdata, - "Missing S1G Operation Element? Trying operating == primary\n"); - chandef->width = ieee80211_s1g_channel_width(channel); + if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, + chandef)) { + /* Fallback to default 1MHz */ + chandef->width = NL80211_CHAN_WIDTH_1; + chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; @@ -1046,6 +1047,14 @@ again: ret = -EINVAL; goto free; } + + chanreq->oper = *ap_chandef; + if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_DISABLED)) { + ret = -EINVAL; + goto free; + } + return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { @@ -7292,6 +7301,38 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } +static bool +ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, + struct ieee80211_mgmt *mgmt, + struct ieee80211_rx_status *rx_status, + struct ieee80211_chanctx_conf *chanctx) +{ + u32 pri_2mhz_khz; + struct ieee80211_channel *s1g_sibling_1mhz; + u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); + u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); + + if (rx_khz == pri_khz) + return true; + + if (!chanctx->def.s1g_primary_2mhz) + return false; + + /* + * If we have an S1G interface with a 2MHz primary, beacons are + * sent on the center frequency of the 2MHz primary. Find the sibling + * 1MHz channel and calculate the 2MHz primary center frequency. + */ + s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, + &chanctx->def); + if (!s1g_sibling_1mhz) + return false; + + pri_2mhz_khz = + (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; + return rx_khz == pri_2mhz_khz; +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -7346,8 +7387,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, return; } - if (ieee80211_rx_status_to_khz(rx_status) != - ieee80211_channel_to_khz(chanctx_conf->def.chan)) { + if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, + chanctx_conf)) { rcu_read_unlock(); return; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index dbf98aa4cd67..bb9563f50e7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -996,15 +996,15 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; - /* For scanning on the S1G band, detect the channel width according to - * the channel being scanned. - */ + /* For S1G, only scan the 1MHz primaries. 
*/ if (chan->band == NL80211_BAND_S1GHZ) { - local->scan_chandef.width = ieee80211_s1g_channel_width(chan); + local->scan_chandef.width = NL80211_CHAN_WIDTH_1; + local->scan_chandef.s1g_primary_2mhz = false; goto set_channel; } - /* If scanning on oper channel, use whatever channel-type + /* + * If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) @@ -1213,7 +1213,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || - band == NL80211_BAND_6GHZ) + band == NL80211_BAND_6GHZ || + band == NL80211_BAND_S1GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 123842b841f2..c9931537f9d2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3199,10 +3199,11 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, return true; } -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { - u32 oper_freq; + u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz; if (!oper) return false; @@ -3227,12 +3228,36 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, return false; } - oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, - NL80211_BAND_S1GHZ); - chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); - chandef->freq1_offset = oper_freq % 1000; + chandef->s1g_primary_2mhz = false; - return true; + switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) { + case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: + pri_1mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + break; + case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: + chandef->s1g_primary_2mhz = true; + pri_2mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + + if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) == + S1G_2M_PRIMARY_LOCATION_LOWER) + pri_1mhz_khz = pri_2mhz_khz - 500; + else + pri_1mhz_khz = pri_2mhz_khz + 500; + break; + default: + return false; + } + + oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch, + NL80211_BAND_S1GHZ); + chandef->center_freq1 = KHZ_TO_MHZ(oper_khz); + chandef->freq1_offset = oper_khz % 1000; + chandef->chan = + ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz); + + return chandef->chan; } int ieee80211_put_srates_elem(struct sk_buff *skb, -- cgit v1.2.3 From cbcd507f01deb983d5cad0a25b6495930ab59593 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:13 +1000 Subject: wifi: cfg80211: remove ieee80211_s1g_channel_width With the introduction of proper S1G channel flags, this function is no longer used. Remove it. 
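For illustration, the flag model that supersedes it can be sketched in isolation, mirroring the S1G branch added to reg_rule_to_chan_bw_flags() earlier in this series; the constants below are stand-ins rather than the kernel's IEEE80211_CHAN_NO_*MHZ flag values:

/* s1g_bw_flags_demo.c - standalone sketch of the prohibition-flag
 * derivation; CHAN_NO_* values are illustrative stand-ins. */
#include <stdio.h>

#define CHAN_NO_4MHZ    (1u << 0)
#define CHAN_NO_8MHZ    (1u << 1)
#define CHAN_NO_16MHZ   (1u << 2)

/* Every width wider than the regulatory maximum gets prohibited;
 * anything not prohibited remains usable on the channel. */
static unsigned int s1g_bw_flags(unsigned int max_bw_khz)
{
        unsigned int flags = 0;

        if (max_bw_khz < 16000)
                flags |= CHAN_NO_16MHZ;
        if (max_bw_khz < 8000)
                flags |= CHAN_NO_8MHZ;
        if (max_bw_khz < 4000)
                flags |= CHAN_NO_4MHZ;
        return flags;
}

int main(void)
{
        /* An 8 MHz rule forbids 16 MHz but leaves 4 and 8 MHz usable. */
        printf("flags for 8 MHz rule: %#x\n", s1g_bw_flags(8000));
        return 0;
}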
Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-4-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ---------- net/wireless/util.c | 27 --------------------------- 2 files changed, 37 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2d612c760dd1..1c041ce7a03b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6818,16 +6818,6 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan) return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; } -/** - * ieee80211_s1g_channel_width - get allowed channel width from @chan - * - * Only allowed for band NL80211_BAND_S1GHZ - * @chan: channel - * Return: The allowed channel width for this center_freq - */ -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan); - /** * ieee80211_channel_to_freq_khz - convert channel number to frequency * @chan: channel number diff --git a/net/wireless/util.c b/net/wireless/util.c index d12d49134c88..f26440d18ad3 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -106,33 +106,6 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) -{ - if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) - return NL80211_CHAN_WIDTH_20_NOHT; - - /*S1G defines a single allowed channel width per channel. - * Extract that width here. - */ - if (chan->flags & IEEE80211_CHAN_1MHZ) - return NL80211_CHAN_WIDTH_1; - else if (chan->flags & IEEE80211_CHAN_2MHZ) - return NL80211_CHAN_WIDTH_2; - else if (chan->flags & IEEE80211_CHAN_4MHZ) - return NL80211_CHAN_WIDTH_4; - else if (chan->flags & IEEE80211_CHAN_8MHZ) - return NL80211_CHAN_WIDTH_8; - else if (chan->flags & IEEE80211_CHAN_16MHZ) - return NL80211_CHAN_WIDTH_16; - - pr_err("unknown channel width for channel at %dKHz?\n", - ieee80211_channel_to_khz(chan)); - - return NL80211_CHAN_WIDTH_1; -} -EXPORT_SYMBOL(ieee80211_s1g_channel_width); - int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ -- cgit v1.2.3 From 32d340ae675800672e1219444a17940a8efe5cca Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Wed, 17 Sep 2025 12:42:03 +0530 Subject: wifi: mac80211: fix Rx packet handling when pubsta information is not available In ieee80211_rx_handle_packet(), if the caller does not provide pubsta information, an attempt is made to find the station using the address 2 (source address) field in the header. Since pubsta is missing, link information such as link_valid and link_id is also unavailable. Now if such a situation comes, and if a matching ML station entry is found based on the source address, currently the packet is dropped due to missing link ID in the status field which is not correct. Hence, to fix this issue, if link_valid is not set and the station is an ML station, make an attempt to find a link station entry using the source address. If a valid link station is found, derive the link ID and proceed with packet processing. Otherwise, drop the packet as per the existing flow. 
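A rough standalone model of that fallback is sketched below; struct link_sta and lookup_link_sta() are stand-ins for mac80211's link_sta_info and link_sta_info_get_bss(), and the canned lookup result is for demonstration only:

/* rx_link_fallback.c - rough model, not mac80211 code. */
#include <stdbool.h>
#include <stdio.h>

struct link_sta {
        int link_id;
};

/* stand-in: pretend the BSS lookup by transmitter address succeeds */
static struct link_sta *lookup_link_sta(const unsigned char *addr2)
{
        static struct link_sta ls = { .link_id = 1 };
        return addr2 ? &ls : NULL;
}

/* Returns false when the frame should be dropped (no link derivable). */
static bool resolve_link_id(bool link_valid, bool mlo,
                            const unsigned char *addr2, int *link_id)
{
        struct link_sta *ls;

        if (link_valid || !mlo)
                return true;            /* status already usable */

        ls = lookup_link_sta(addr2);    /* keyed on hdr->addr2 */
        if (!ls)
                return false;           /* existing drop path */

        *link_id = ls->link_id;
        return true;
}

int main(void)
{
        unsigned char addr2[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
        int link_id = -1;

        if (resolve_link_id(false, true, addr2, &link_id))
                printf("derived link_id %d\n", link_id);
        return 0;
}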
Fixes: ea9d807b5642 ("wifi: mac80211: add link information in ieee80211_rx_status") Suggested-by: Vasanthakumar Thiagarajan Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250917-fix_data_packet_rx_with_mlo_and_no_pubsta-v1-1-8cf971a958ac@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index feb81ffa4f8c..6af43dfefdd6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5238,12 +5238,20 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, } rx.sdata = prev_sta->sdata; + if (!status->link_valid && prev_sta->sta.mlo) { + struct link_sta_info *link_sta; + + link_sta = link_sta_info_get_bss(rx.sdata, + hdr->addr2); + if (!link_sta) + continue; + + link_id = link_sta->link_id; + } + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; - if (!status->link_valid && prev_sta->sta.mlo) - continue; - ieee80211_prepare_and_rx_handle(&rx, skb, false); prev_sta = sta; @@ -5251,10 +5259,18 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (prev_sta) { rx.sdata = prev_sta->sdata; - if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) - goto out; + if (!status->link_valid && prev_sta->sta.mlo) { + struct link_sta_info *link_sta; - if (!status->link_valid && prev_sta->sta.mlo) + link_sta = link_sta_info_get_bss(rx.sdata, + hdr->addr2); + if (!link_sta) + goto out; + + link_id = link_sta->link_id; + } + + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) -- cgit v1.2.3 From 08027f6b790be1e444e4182fb4dc53faa6539d16 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:34 +0200 Subject: net: use ns_common_init() Don't cargo-cult the same thing over and over. Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..5fb7bd8ac45a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -397,10 +397,22 @@ static __net_init void preinit_net_sysctl(struct net *net) } /* init code that must occur even if setup_net() is not called. 
*/ -static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { + const struct proc_ns_operations *ns_ops; + int ret; + +#ifdef CONFIG_NET_NS + ns_ops = &netns_operations; +#else + ns_ops = NULL; +#endif + + ret = ns_common_init(&net->ns, ns_ops, false); + if (ret) + return ret; + refcount_set(&net->passive, 1); - refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); @@ -420,6 +432,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); + return 0; } /* @@ -559,7 +572,9 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net, user_ns); + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); @@ -812,15 +827,15 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { -#ifdef CONFIG_NET_NS - net->ns.ops = &netns_operations; -#endif - net->ns.inum = PROC_NET_INIT_INO; - if (net != &init_net) { - int ret = ns_alloc_inum(&net->ns); - if (ret) - return ret; - } + int ret = 0; + + if (net == &init_net) + net->ns.inum = PROC_NET_INIT_INO; + else + ret = proc_alloc_inum(&to_ns_common(net)->inum); + if (ret) + return ret; + net_ns_net_debugfs(net); return 0; } @@ -1282,7 +1297,12 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif - preinit_net(&init_net, &init_user_ns); + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. + */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) -- cgit v1.2.3 From 195f7422298d711e89643369988ed285d484dd74 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:44 +0200 Subject: net: support ns lookup Support the generic ns lookup infrastructure to enable file handles for namespaces. The network namespace has a separate list with different lifetime rules which we can just leave intact. We have a similar concept for mount namespaces as well where it is on two different lists for different purposes. Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5fb7bd8ac45a..169ec22c4758 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -445,7 +446,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); + net->net_cookie = ns_tree_gen_id(&net->ns); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -455,6 +456,7 @@ static __net_init int setup_net(struct net *net) down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -674,8 +676,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. 
*/ down_write(&net_rwsem); - llist_for_each_entry(net, net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); + } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). -- cgit v1.2.3 From d7afdf889561058068ab46fd8f306c70ef29216a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:49 +0200 Subject: ns: add to_<type>_ns() to respective headers Every namespace type has a container_of(ns, <type>, ns) static inline function that is currently not exposed in the header. So we have a bunch of places that open-code it via container_of(). Move it to the headers so we can use it directly. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 +++++ include/linux/ipc_namespace.h | 5 +++++ include/linux/pid_namespace.h | 5 +++++ include/linux/time_namespace.h | 4 ++++ include/linux/user_namespace.h | 5 +++++ include/linux/utsname.h | 5 +++++ include/net/net_namespace.h | 5 +++++ ipc/namespace.c | 5 ----- kernel/cgroup/namespace.c | 5 ----- kernel/pid_namespace.c | 5 ----- kernel/time/namespace.c | 5 ----- kernel/user_namespace.c | 5 ----- kernel/utsname.c | 5 ----- net/core/net_namespace.c | 5 ----- 14 files changed, 34 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..9ca25346f7cb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -794,6 +794,11 @@ extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..924e4754374f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,11 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..ba0efc8c8596 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,6 +54,11 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 7f6af7a9771e..a47a4ce4183e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,10 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); diff 
--git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d012137..a09056ad090e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,6 +168,11 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..5d34c4f0f945 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -30,6 +30,11 @@ struct uts_namespace { extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + static inline void get_uts_ns(struct uts_namespace *ns) { refcount_inc(&ns->ns.count); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..fd090ceb80bf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -262,6 +262,11 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 9f923c1a1eb3..89588819956b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -209,11 +209,6 @@ void put_ipc_ns(struct ipc_namespace *ns) } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fc12c416dfeb..5a327914b565 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -89,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 228ae20299f9..9b327420309e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -345,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 408f60d0a3b6..20b65f90549e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -261,11 +261,6 @@ void free_time_ns(struct time_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); -} - static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index ade5b6806c5c..cfb0e28f2779 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ 
-1325,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; diff --git a/kernel/utsname.c b/kernel/utsname.c index 64155417ae0c..a682830742d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -103,11 +103,6 @@ void free_uts_ns(struct uts_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 169ec22c4758..a57b3cda8dbc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1541,11 +1541,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); -- cgit v1.2.3 From d5b27cb8c5f30c972e041b30bc38fa5875b1a469 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:06 +0200 Subject: net: centralize ns_common initialization Centralize ns_common initialization. Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a57b3cda8dbc..9df236811454 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, false); + ret = ns_common_init(&net->ns, ns_ops, true); if (ret) return ret; @@ -590,6 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + ns_free_inum(&net->ns); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -712,6 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_free_inum(&net->ns); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); @@ -831,31 +833,12 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { - int ret = 0; - - if (net == &init_net) - net->ns.inum = PROC_NET_INIT_INO; - else - ret = proc_alloc_inum(&to_ns_common(net)->inum); - if (ret) - return ret; - net_ns_net_debugfs(net); return 0; } -static __net_exit void net_ns_net_exit(struct net *net) -{ - /* - * Initial network namespace doesn't exit so we don't need any - * special checks here. 
- */ - ns_free_inum(&net->ns); -} - static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { -- cgit v1.2.3 From 5612ff3ec588be09f11a9424db6d1186bcdeb3fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:07 +0200 Subject: nscommon: simplify initialization There's a lot of information that namespace implementers don't need to know about at all. Encapsulate this all in the initialization helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 5 +++-- include/linux/ns_common.h | 39 +++++++++++++++++++++++++++++++++++++-- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 17 ++++++++--------- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- 10 files changed, 55 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/fs/namespace.c b/fs/namespace.c index b2fcb901ad8c..699b8c770c47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,8 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - new_ns->ns.inum = MNT_NS_ANON_INO; - ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns, &mntns_operations); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 78b17fe80b62..05c7a7dd211b 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -16,6 +16,15 @@ struct time_namespace; struct user_namespace; struct uts_namespace; +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -31,8 +40,7 @@ struct ns_common { }; }; -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum); +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -45,4 +53,31 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, struct user_namespace *: &(__ns)->ns, \ struct uts_namespace *: &(__ns)->ns) +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_common_init(__ns, __ops) \ + __ns_common_init(to_ns_common(__ns), 
__ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) + #endif diff --git a/ipc/namespace.c b/ipc/namespace.c index 89588819956b..0f8bbd18a475 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(&ns->ns, &ipcns_operations, true); + err = ns_common_init(ns, &ipcns_operations); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 5a327914b565..d928c557e28b 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + ret = ns_common_init(new_ns, &cgroupns_operations); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index e10fad8afe61..c3a90bb665ad 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,21 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum) +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - if (alloc_inum && !ns->inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); - return 0; + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 9b327420309e..170757c265c2 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(&ns->ns, &pidns_operations, true); + err = ns_common_init(ns, &pidns_operations); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 20b65f90549e..ce8e952104a7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(&ns->ns, &timens_operations, true); + err = ns_common_init(ns, &timens_operations); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index cfb0e28f2779..db9f0463219c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(&ns->ns, &userns_operations, true); + ret = ns_common_init(ns, &userns_operations); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index a682830742d3..399888be66bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(&ns->ns, &utsns_operations, true); + err = ns_common_init(ns, &utsns_operations); if (err) goto fail_free; diff --git 
a/net/core/net_namespace.c b/net/core/net_namespace.c index 9df236811454..e50897fba8cd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, true); + ret = ns_common_init(net, ns_ops); if (ret) return ret; -- cgit v1.2.3 From 17f1b7711e81107de60ff1f74b93fe5111dd3b0a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Sep 2025 11:52:38 +0000 Subject: psp: do not use sk_dst_get() in psp_dev_get_for_sock() Use __sk_dst_get() and dst_dev_rcu(), because dst->dev could be changed under us. Fixes: 6b46ca260e22 ("net: psp: add socket security association code") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Tested-by: Daniel Zahka Reviewed-by: Daniel Zahka Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250918115238.237475-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/psp/psp_sock.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index afa966c6b69d..d19e37e93967 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -11,21 +11,18 @@ struct psp_dev *psp_dev_get_for_sock(struct sock *sk) { + struct psp_dev *psd = NULL; struct dst_entry *dst; - struct psp_dev *psd; - - dst = sk_dst_get(sk); - if (!dst) - return NULL; rcu_read_lock(); - psd = rcu_dereference(dst->dev->psp_dev); - if (psd && !psp_dev_tryget(psd)) - psd = NULL; + dst = __sk_dst_get(sk); + if (dst) { + psd = rcu_dereference(dst_dev_rcu(dst)->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + } rcu_read_unlock(); - dst_release(dst); - return psd; } -- cgit v1.2.3 From 833d4313bc1e9e194814917d23e8874d6b651649 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 18 Sep 2025 10:50:18 +0200 Subject: mptcp: reset blackhole on success with non-loopback ifaces When a first MPTCP connection gets successfully established after a blackhole period, 'active_disable_times' was supposed to be reset when this connection was done via any non-loopback interfaces. Unfortunately, the opposite condition was checked: only reset when the connection was established via a loopback interface. Fixing this by simply looking at the opposite. This is similar to what is done with TCP FastOpen, see tcp_fastopen_active_disable_ofo_check(). This patch is a follow-up of a previous discussion linked to commit 893c49a78d9f ("mptcp: Use __sk_dst_get() and dst_dev_rcu() in mptcp_active_enable()."), see [1]. Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/4209a283-8822-47bd-95b7-87e96d9b7ea3@kernel.org [1] Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250918-net-next-mptcp-blackhole-reset-loopback-v1-1-bf5818326639@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index e8ffa62ec183..d96130e49942 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -507,7 +507,7 @@ void mptcp_active_enable(struct sock *sk) rcu_read_lock(); dst = __sk_dst_get(sk); dev = dst ? 
dst_dev_rcu(dst) : NULL; - if (dev && (dev->flags & IFF_LOOPBACK)) + if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&pernet->active_disable_times, 0); rcu_read_unlock(); } -- cgit v1.2.3 From be5f21d3985f00827e09b798f7a07ebd6dd7f54a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:08 +0200 Subject: ns: add ns_common_free() And drop ns_free_inum(). Anything common that can be wasted centrally should be wasted in the new common helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 2 -- ipc/namespace.c | 4 ++-- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 5 +++++ kernel/pid_namespace.c | 4 ++-- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 4 ++-- kernel/utsname.c | 2 +- net/core/net_namespace.c | 4 ++-- 11 files changed, 21 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/fs/namespace.c b/fs/namespace.c index 699b8c770c47..b9f94769ec11 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4082,7 +4082,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } @@ -4154,7 +4154,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 05c7a7dd211b..19833ac547f9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -41,6 +41,7 @@ struct ns_common { }; int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -80,4 +81,6 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, #define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + #endif diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 9f21670b5824..08016f6e0e6f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,8 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -#define ns_free_inum(ns) proc_free_inum((ns)->inum) - #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) #endif /* _LINUX_PROC_NS_H */ diff --git a/ipc/namespace.c b/ipc/namespace.c index 0f8bbd18a475..09d261a1a2aa 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -97,7 +97,7 @@ fail_mq: fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index d928c557e28b..16ead7508371 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -40,7 +40,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); 
+ ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c3a90bb665ad..7c1b07e2a6c9 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -18,3 +18,8 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, } return proc_alloc_inum(&ns->inum); } + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 170757c265c2..27e2dd9ee051 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -127,7 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -152,7 +152,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index ce8e952104a7..d49c73015d6e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -255,7 +255,7 @@ void free_time_ns(struct time_namespace *ns) ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index db9f0463219c..32406bcab526 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -165,7 +165,7 @@ fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -220,7 +220,7 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); diff --git a/kernel/utsname.c b/kernel/utsname.c index 399888be66bd..95d733eb2c98 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -98,7 +98,7 @@ void free_uts_ns(struct uts_namespace *ns) ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. 
*/ kfree_rcu(ns, ns.ns_rcu); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e50897fba8cd..a6a3de56a81c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -590,7 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: - ns_free_inum(&net->ns); + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -713,7 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - ns_free_inum(&net->ns); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); -- cgit v1.2.3 From 83914de1c1d39dca4a3196a03bcd64d0a861d551 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:53 +0200 Subject: net-sysfs: use check_net() Don't directly access the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/core/net-sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd6665444..3c2dc4c5e683 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) @@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->ns.count)) + if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); -- cgit v1.2.3 From dc41b844da530e94f5b8384deb2af602cbeb312a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:54 +0200 Subject: net: use check_net() Don't directly access the namespace count. There's even a dedicated helper for this. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a6a3de56a81c..d5e3fd819163 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -315,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->ns.count) == 0) + if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock); -- cgit v1.2.3 From f12021e68a13f4f867b8d55212254f1f83f75e00 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:55 +0200 Subject: ipv4: use check_net() Don't directly access the namespace count. There's even a dedicated helper for this. 
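For reference, the helper being adopted is a simple predicate over the namespace refcount; a userspace model of the idiom follows, with a plain int standing in for the kernel's refcount_t (the real check_net() lives in include/net/net_namespace.h):

/* check_net_demo.c - userspace model, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct ns_common { int count; };
struct net { struct ns_common ns; };

/* alive iff the namespace refcount has not dropped to zero */
static bool check_net(const struct net *net)
{
        return net->ns.count != 0;
}

int main(void)
{
        struct net live = { .ns = { .count = 1 } };
        struct net dead = { .ns = { .count = 0 } };

        printf("live: %d, dead: %d\n", check_net(&live), check_net(&dead));
        return 0;
}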
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- net/ipv4/inet_timewait_sock.c | 4 ++-- net/ipv4/tcp_metrics.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed..56a117560c0c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -329,13 +329,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (refcount_read(&sock_net(sk)->ns.count)) + if (check_net(sock_net(sk))) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (refcount_read(&sock_net(sk)->ns.count)) { + if (check_net(sock_net(sk))) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03c068ea27b6..b67f94c60f9f 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : - !refcount_read(&tm_net(tm)->ns.count); + !check_net(tm_net(tm)); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); -- cgit v1.2.3 From fc8418eca43d5872e3976636d7c4924094bd07fd Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 17 Sep 2025 13:48:24 +0900 Subject: can: raw: reorder struct uniqframe's members to optimise packing struct uniqframe has one hole. Reorder the fields to save 8 bytes. Statistics before: $ pahole --class_name=uniqframe net/can/raw.o struct uniqframe { int skbcnt; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ const struct sk_buff * skb; /* 8 8 */ unsigned int join_rx_count; /* 16 4 */ /* size: 24, cachelines: 1, members: 3 */ /* sum members: 16, holes: 1, sum holes: 4 */ /* padding: 4 */ /* last cacheline: 24 bytes */ }; ...and after: $ pahole --class_name=uniqframe net/can/raw.o struct uniqframe { const struct sk_buff * skb; /* 0 8 */ int skbcnt; /* 8 4 */ unsigned int join_rx_count; /* 12 4 */ /* size: 16, cachelines: 1, members: 3 */ /* last cacheline: 16 bytes */ }; Acked-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250917-can-raw-repack-v2-1-395e8b3a4437@kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 76b867d21def..db21d8a8c54d 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -75,8 +75,8 @@ MODULE_ALIAS("can-proto-1"); */ struct uniqframe { - int skbcnt; const struct sk_buff *skb; + int skbcnt; unsigned int join_rx_count; }; -- cgit v1.2.3 From 890e5198a6e5b238627c245fafea1a92670a86cd Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 17 Sep 2025 13:48:25 +0900 Subject: can: raw: use bitfields to store flags in struct raw_sock The bound, loopback, recv_own_msgs, fd_frames, xl_frames and join_filters fields of struct raw_sock just need to store one bit of information. Declare all those members as a bitfields of type unsigned int and width one bit. Add a temporary variable to raw_setsockopt() and raw_getsockopt() to make the conversion between the stored bits and the socket interface. This reduces the size of struct raw_sock by sixteen bytes. 
Statistics before: $ pahole --class_name=raw_sock net/can/raw.o struct raw_sock { struct sock sk __attribute__((__aligned__(8))); /* 0 776 */ /* XXX last struct has 1 bit hole */ /* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */ int bound; /* 776 4 */ int ifindex; /* 780 4 */ struct net_device * dev; /* 784 8 */ netdevice_tracker dev_tracker; /* 792 0 */ struct list_head notifier; /* 792 16 */ int loopback; /* 808 4 */ int recv_own_msgs; /* 812 4 */ int fd_frames; /* 816 4 */ int xl_frames; /* 820 4 */ struct can_raw_vcid_options raw_vcid_opts; /* 824 4 */ canid_t tx_vcid_shifted; /* 828 4 */ /* --- cacheline 13 boundary (832 bytes) --- */ canid_t rx_vcid_shifted; /* 832 4 */ canid_t rx_vcid_mask_shifted; /* 836 4 */ int join_filters; /* 840 4 */ int count; /* 844 4 */ struct can_filter dfilter; /* 848 8 */ struct can_filter * filter; /* 856 8 */ can_err_mask_t err_mask; /* 864 4 */ /* XXX 4 bytes hole, try to pack */ struct uniqframe * uniq; /* 872 8 */ /* size: 880, cachelines: 14, members: 20 */ /* sum members: 876, holes: 1, sum holes: 4 */ /* member types with bit holes: 1, total: 1 */ /* forced alignments: 1 */ /* last cacheline: 48 bytes */ } __attribute__((__aligned__(8))); ...and after: $ pahole --class_name=raw_sock net/can/raw.o struct raw_sock { struct sock sk __attribute__((__aligned__(8))); /* 0 776 */ /* XXX last struct has 1 bit hole */ /* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */ int ifindex; /* 776 4 */ /* XXX 4 bytes hole, try to pack */ struct net_device * dev; /* 784 8 */ netdevice_tracker dev_tracker; /* 792 0 */ struct list_head notifier; /* 792 16 */ unsigned int bound:1; /* 808: 0 4 */ unsigned int loopback:1; /* 808: 1 4 */ unsigned int recv_own_msgs:1; /* 808: 2 4 */ unsigned int fd_frames:1; /* 808: 3 4 */ unsigned int xl_frames:1; /* 808: 4 4 */ unsigned int join_filters:1; /* 808: 5 4 */ /* XXX 2 bits hole, try to pack */ /* Bitfield combined with next fields */ struct can_raw_vcid_options raw_vcid_opts; /* 809 4 */ /* XXX 3 bytes hole, try to pack */ canid_t tx_vcid_shifted; /* 816 4 */ canid_t rx_vcid_shifted; /* 820 4 */ canid_t rx_vcid_mask_shifted; /* 824 4 */ int count; /* 828 4 */ /* --- cacheline 13 boundary (832 bytes) --- */ struct can_filter dfilter; /* 832 8 */ struct can_filter * filter; /* 840 8 */ can_err_mask_t err_mask; /* 848 4 */ /* XXX 4 bytes hole, try to pack */ struct uniqframe * uniq; /* 856 8 */ /* size: 864, cachelines: 14, members: 20 */ /* sum members: 852, holes: 3, sum holes: 11 */ /* sum bitfield members: 6 bits, bit holes: 1, sum bit holes: 2 bits */ /* member types with bit holes: 1, total: 1 */ /* forced alignments: 1 */ /* last cacheline: 32 bytes */ } __attribute__((__aligned__(8))); Acked-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250917-can-raw-repack-v2-2-395e8b3a4437@kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 59 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index db21d8a8c54d..5a5ded519cd1 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -82,20 +82,20 @@ struct uniqframe { struct raw_sock { struct sock sk; - int bound; int ifindex; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; - int loopback; - int recv_own_msgs; - int fd_frames; - int xl_frames; + unsigned int bound:1; + unsigned int loopback:1; + unsigned int recv_own_msgs:1; + unsigned int fd_frames:1; + unsigned 
int xl_frames:1; + unsigned int join_filters:1; struct can_raw_vcid_options raw_vcid_opts; canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; - int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ @@ -560,8 +560,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; - int fd_frames; int count = 0; + int flag; int err = 0; if (level != SOL_CAN_RAW) @@ -682,44 +682,48 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_LOOPBACK: - if (optlen != sizeof(ro->loopback)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->loopback, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->loopback = !!flag; break; case CAN_RAW_RECV_OWN_MSGS: - if (optlen != sizeof(ro->recv_own_msgs)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->recv_own_msgs = !!flag; break; case CAN_RAW_FD_FRAMES: - if (optlen != sizeof(fd_frames)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&fd_frames, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ - if (ro->xl_frames && !fd_frames) + if (ro->xl_frames && !flag) return -EINVAL; - ro->fd_frames = fd_frames; + ro->fd_frames = !!flag; break; case CAN_RAW_XL_FRAMES: - if (optlen != sizeof(ro->xl_frames)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->xl_frames = !!flag; + /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames) ro->fd_frames = ro->xl_frames; @@ -739,12 +743,13 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_JOIN_FILTERS: - if (optlen != sizeof(ro->join_filters)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->join_filters, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->join_filters = !!flag; break; default: @@ -758,6 +763,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + int flag; int len; void *val; @@ -806,25 +812,29 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, case CAN_RAW_LOOPBACK: if (len > sizeof(int)) len = sizeof(int); - val = &ro->loopback; + flag = ro->loopback; + val = &flag; break; case CAN_RAW_RECV_OWN_MSGS: if (len > sizeof(int)) len = sizeof(int); - val = &ro->recv_own_msgs; + flag = ro->recv_own_msgs; + val = &flag; break; case CAN_RAW_FD_FRAMES: if (len > sizeof(int)) len = sizeof(int); - val = &ro->fd_frames; + flag = ro->fd_frames; + val = &flag; break; case CAN_RAW_XL_FRAMES: if (len > sizeof(int)) len = sizeof(int); - val = &ro->xl_frames; + flag = ro->xl_frames; + val = &flag; break; case CAN_RAW_XL_VCID_OPTS: { @@ -849,7 +859,8 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); - val = &ro->join_filters; + flag = ro->join_filters; + val = &flag; break; default: -- cgit v1.2.3 From 
a146cfaaa0dd8a3e2cf3447cd2965a3c4d046e8f Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 17 Sep 2025 13:48:26 +0900 Subject: can: raw: reorder struct raw_sock's members to optimise packing struct raw_sock has several holes. Reorder the fields to save 8 bytes. Statistics before: $ pahole --class_name=raw_sock net/can/raw.o struct raw_sock { struct sock sk __attribute__((__aligned__(8))); /* 0 776 */ /* XXX last struct has 1 bit hole */ /* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */ int ifindex; /* 776 4 */ /* XXX 4 bytes hole, try to pack */ struct net_device * dev; /* 784 8 */ netdevice_tracker dev_tracker; /* 792 0 */ struct list_head notifier; /* 792 16 */ unsigned int bound:1; /* 808: 0 4 */ unsigned int loopback:1; /* 808: 1 4 */ unsigned int recv_own_msgs:1; /* 808: 2 4 */ unsigned int fd_frames:1; /* 808: 3 4 */ unsigned int xl_frames:1; /* 808: 4 4 */ unsigned int join_filters:1; /* 808: 5 4 */ /* XXX 2 bits hole, try to pack */ /* Bitfield combined with next fields */ struct can_raw_vcid_options raw_vcid_opts; /* 809 4 */ /* XXX 3 bytes hole, try to pack */ canid_t tx_vcid_shifted; /* 816 4 */ canid_t rx_vcid_shifted; /* 820 4 */ canid_t rx_vcid_mask_shifted; /* 824 4 */ int count; /* 828 4 */ /* --- cacheline 13 boundary (832 bytes) --- */ struct can_filter dfilter; /* 832 8 */ struct can_filter * filter; /* 840 8 */ can_err_mask_t err_mask; /* 848 4 */ /* XXX 4 bytes hole, try to pack */ struct uniqframe * uniq; /* 856 8 */ /* size: 864, cachelines: 14, members: 20 */ /* sum members: 852, holes: 3, sum holes: 11 */ /* sum bitfield members: 6 bits, bit holes: 1, sum bit holes: 2 bits */ /* member types with bit holes: 1, total: 1 */ /* forced alignments: 1 */ /* last cacheline: 32 bytes */ } __attribute__((__aligned__(8))); ...and after: $ pahole --class_name=raw_sock net/can/raw.o struct raw_sock { struct sock sk __attribute__((__aligned__(8))); /* 0 776 */ /* XXX last struct has 1 bit hole */ /* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */ struct net_device * dev; /* 776 8 */ netdevice_tracker dev_tracker; /* 784 0 */ struct list_head notifier; /* 784 16 */ int ifindex; /* 800 4 */ unsigned int bound:1; /* 804: 0 4 */ unsigned int loopback:1; /* 804: 1 4 */ unsigned int recv_own_msgs:1; /* 804: 2 4 */ unsigned int fd_frames:1; /* 804: 3 4 */ unsigned int xl_frames:1; /* 804: 4 4 */ unsigned int join_filters:1; /* 804: 5 4 */ /* XXX 2 bits hole, try to pack */ /* Bitfield combined with next fields */ struct can_raw_vcid_options raw_vcid_opts; /* 805 4 */ /* XXX 3 bytes hole, try to pack */ canid_t tx_vcid_shifted; /* 812 4 */ canid_t rx_vcid_shifted; /* 816 4 */ canid_t rx_vcid_mask_shifted; /* 820 4 */ can_err_mask_t err_mask; /* 824 4 */ int count; /* 828 4 */ /* --- cacheline 13 boundary (832 bytes) --- */ struct can_filter dfilter; /* 832 8 */ struct can_filter * filter; /* 840 8 */ struct uniqframe * uniq; /* 848 8 */ /* size: 856, cachelines: 14, members: 20 */ /* sum members: 852, holes: 1, sum holes: 3 */ /* sum bitfield members: 6 bits, bit holes: 1, sum bit holes: 2 bits */ /* member types with bit holes: 1, total: 1 */ /* forced alignments: 1 */ /* last cacheline: 24 bytes */ } __attribute__((__aligned__(8))); Acked-by: Oliver Hartkopp Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250917-can-raw-repack-v2-3-395e8b3a4437@kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 
5a5ded519cd1..bf65d67b5df0 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -82,10 +82,10 @@ struct uniqframe { struct raw_sock { struct sock sk; - int ifindex; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; + int ifindex; unsigned int bound:1; unsigned int loopback:1; unsigned int recv_own_msgs:1; @@ -96,10 +96,10 @@ struct raw_sock { canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; + can_err_mask_t err_mask; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ - can_err_mask_t err_mask; struct uniqframe __percpu *uniq; }; -- cgit v1.2.3 From a35c04de2565db191726b5741e6b66a35002c652 Mon Sep 17 00:00:00 2001 From: Sidraya Jayagond Date: Wed, 17 Sep 2025 20:42:20 +0200 Subject: net/smc: fix warning in smc_rx_splice() when calling get_page() smc_lo_register_dmb() allocates DMB buffers with kzalloc(), which are later passed to get_page() in smc_rx_splice(). Since kmalloc memory is not page-backed, this triggers WARN_ON_ONCE() in get_page() and prevents holding a refcount on the buffer. This can lead to use-after-free if the memory is released before splice_to_pipe() completes. Use folio_alloc() instead, ensuring DMBs are page-backed and safe for get_page(). WARNING: CPU: 18 PID: 12152 at ./include/linux/mm.h:1330 smc_rx_splice+0xaf8/0xe20 [smc] CPU: 18 UID: 0 PID: 12152 Comm: smcapp Kdump: loaded Not tainted 6.17.0-rc3-11705-g9cf4672ecfee #10 NONE Hardware name: IBM 3931 A01 704 (z/VM 7.4.0) Krnl PSW : 0704e00180000000 000793161032696c (smc_rx_splice+0xafc/0xe20 [smc]) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000000 001cee80007d3001 00077400000000f8 0000000000000005 0000000000000001 001cee80007d3006 0007740000001000 001c000000000000 000000009b0c99e0 0000000000001000 001c0000000000f8 001c000000000000 000003ffcc6f7c88 0007740003e98000 0007931600000005 000792969b2ff7b8 Krnl Code: 0007931610326960: af000000 mc 0,0 0007931610326964: a7f4ff43 brc 15,00079316103267ea #0007931610326968: af000000 mc 0,0 >000793161032696c: a7f4ff3f brc 15,00079316103267ea 0007931610326970: e320f1000004 lg %r2,256(%r15) 0007931610326976: c0e53fd1b5f5 brasl %r14,000793168fd5d560 000793161032697c: a7f4fbb5 brc 15,00079316103260e6 0007931610326980: b904002b lgr %r2,%r11 Call Trace: smc_rx_splice+0xafc/0xe20 [smc] smc_rx_splice+0x756/0xe20 [smc]) smc_rx_recvmsg+0xa74/0xe00 [smc] smc_splice_read+0x1ce/0x3b0 [smc] sock_splice_read+0xa2/0xf0 do_splice_read+0x198/0x240 splice_file_to_pipe+0x7e/0x110 do_splice+0x59e/0xde0 __do_splice+0x11a/0x2d0 __s390x_sys_splice+0x140/0x1f0 __do_syscall+0x122/0x280 system_call+0x6e/0x90 Last Breaking-Event-Address: smc_rx_splice+0x960/0xe20 [smc] ---[ end trace 0000000000000000 ]--- Fixes: f7a22071dbf3 ("net/smc: implement DMB-related operations of loopback-ism") Reviewed-by: Mahanta Jambigi Signed-off-by: Sidraya Jayagond Link: https://patch.msgid.link/20250917184220.801066-1-sidraya@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/smc_loopback.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 0eb00bbefd17..77cc1c6dc3e9 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -56,6 +56,7 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, { struct smc_lo_dmb_node *dmb_node, *tmp_node; struct smc_lo_dev *ldev = smcd->priv; + struct folio 
*folio; int sba_idx, rc; /* check space for new dmb */ @@ -74,13 +75,16 @@ static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, dmb_node->sba_idx = sba_idx; dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { + + /* not critical; fail under memory pressure and fallback to TCP */ + folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_ZERO, + get_order(dmb_node->len)); + if (!folio) { rc = -ENOMEM; goto err_node; } + dmb_node->cpu_addr = folio_address(folio); dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; refcount_set(&dmb_node->refcnt, 1); @@ -122,7 +126,7 @@ static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, write_unlock_bh(&ldev->dmb_ht_lock); clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); + folio_put(virt_to_folio(dmb_node->cpu_addr)); kfree(dmb_node); if (atomic_dec_and_test(&ldev->dmb_cnt)) -- cgit v1.2.3 From b02c1230104df86d282bd298e5313bb9686cbd70 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Sep 2025 13:20:07 +0000 Subject: tcp: prefer sk_skb_reason_drop() Replace two calls to kfree_skb_reason() with sk_skb_reason_drop(). Signed-off-by: Eric Dumazet Cc: Daniel Zahka Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250918132007.325299-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 2 +- net/psp/psp_sock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 223d7feeb19d..bb3576ac0ad7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1586,7 +1586,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { - kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index d19e37e93967..5324a7603bed 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -37,7 +37,7 @@ psp_validate_xmit(struct sock *sk, struct net_device *dev, struct sk_buff *skb) good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; rcu_read_unlock(); if (!good) { - kfree_skb_reason(skb, SKB_DROP_REASON_PSP_OUTPUT); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PSP_OUTPUT); return NULL; } -- cgit v1.2.3 From 85c7333c35f22cdb8391b4cacfdc496aec4162ae Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 18 Sep 2025 14:27:20 -0700 Subject: psp: clarify checksum behavior of psp_dev_rcv() psp_dev_rcv() decapsulates psp headers from a received frame. This will make any csum complete computed by the device inaccurate. Rather than attempt to patch up skb->csum in psp_dev_rcv() just make it clear to callers what they can expect regarding checksum complete. 
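On the caller side, a hypothetical sketch of what this documentation implies (assuming the usual sk_buff checksum fields and that psp_dev_rcv() returns 0 on success; not code from the patch): once the PSP headers are stripped, a device-computed CHECKSUM_COMPLETE value no longer matches the data and should be downgraded so the stack verifies the checksum in software:

	err = psp_dev_rcv(skb, dev_id, generation, strip_icv);
	if (!err && skb->ip_summed == CHECKSUM_COMPLETE)
		skb->ip_summed = CHECKSUM_NONE;	/* skb->csum is stale after decap */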
Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250918212723.17495-1-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- net/psp/psp_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 0f8c50c8e943..481aaf0fc9fc 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -228,7 +228,8 @@ EXPORT_SYMBOL(psp_dev_encapsulate); * Presently it accepts only already-authenticated packets and does not * support optional fields, such as virtualization cookies. The caller should * ensure that skb->data is pointing to the mac header, and that skb->mac_len - * is set. + * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE + * is not supported). */ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) { -- cgit v1.2.3 From b73b8146d7ff68e245525adb944a4c998d423d59 Mon Sep 17 00:00:00 2001 From: Alasdair McWilliam Date: Wed, 17 Sep 2025 10:55:42 +0100 Subject: rtnetlink: add needed_{head,tail}room attributes Various network interface types make use of needed_{head,tail}room values to efficiently reserve buffer space for additional encapsulation headers, such as VXLAN, Geneve, IPSec, etc. However, it is not currently possible to query these values in a generic way. Introduce the ability to query the needed_{head,tail}room values of a network device via rtnetlink, such that applications that may wish to use these values can do so. For example, the Cilium agent iterates over present devices based on user config (direct routing, vxlan, geneve, wireguard etc.) and in the future will configure netkit in order to expose the needed_{head,tail}room into K8s pods. See b9ed315d3c4c ("netkit: Allow for configuring needed_{head,tail}room").
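A usage sketch in hypothetical userspace C (assuming uapi headers that already carry the new IFLA_HEADROOM/IFLA_TAILROOM values): the attributes are read from an RTM_NEWLINK dump like any other u16 link attribute:

	#include <stdio.h>
	#include <linux/netlink.h>
	#include <linux/rtnetlink.h>
	#include <linux/if_link.h>

	/* walk the IFLA_* attributes of one RTM_NEWLINK message */
	static void print_rooms(struct nlmsghdr *nlh)
	{
		struct ifinfomsg *ifi = NLMSG_DATA(nlh);
		struct rtattr *rta = IFLA_RTA(ifi);
		int len = IFLA_PAYLOAD(nlh);

		for (; RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
			if (rta->rta_type == IFLA_HEADROOM)
				printf("needed_headroom: %hu\n", *(__u16 *)RTA_DATA(rta));
			else if (rta->rta_type == IFLA_TAILROOM)
				printf("needed_tailroom: %hu\n", *(__u16 *)RTA_DATA(rta));
		}
	}

Note that the NLA_REJECT policy entries in the diff below make the new attributes read-only: they are reported in dumps, but any attempt to set them from userspace is rejected.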
Suggested-by: Daniel Borkmann Signed-off-by: Alasdair McWilliam Reviewed-by: Daniel Borkmann Link: https://patch.msgid.link/20250917095543.14039-1-alasdair@mcwilliam.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 6 ++++++ include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 10 +++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6ab31f86854d..2a23e9699c0b 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1057,6 +1057,12 @@ attribute-sets: - name: netns-immutable type: u8 + - + name: headroom + type: u16 + - + name: tailroom + type: u16 - name: prop-list-link-attrs subset-of: link-attrs diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 45f56c9f95d9..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -379,6 +379,8 @@ enum { IFLA_DPLL_PIN, IFLA_MAX_PACING_OFFLOAD_HORIZON, IFLA_NETNS_IMMUTABLE, + IFLA_HEADROOM, + IFLA_TAILROOM, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 094b085cff20..d9e68ca84926 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1326,6 +1326,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; } @@ -2091,7 +2093,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -2243,6 +2249,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From 1c7e4a618509476658bafba35fffb3a5cfb213b1 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 15 Sep 2025 11:19:54 +0200 Subject: net: ipv4: make udp_v4_early_demux explicitly return drop reason udp_v4_early_demux already returns drop reasons as it either returns 0 or the return value of ip_mc_validate_source, which itself returns drop reasons. Its return value is also already used as a drop reason itself. Make this explicit by making it return drop reasons.
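The conversion is safe for existing callers because SKB_NOT_DROPPED_YET is defined as 0, so a drop-reason return still tests like the old int return; a minimal sketch of the calling convention (simplified from the diffs that follow):

	enum skb_drop_reason reason;

	reason = udp_v4_early_demux(skb);
	if (reason)	/* any non-zero value names a concrete drop cause */
		goto drop_error;
	/* reason == SKB_NOT_DROPPED_YET (0): keep processing the packet */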
Signed-off-by: Antoine Tenart Reviewed-by: David Ahern Link: https://patch.msgid.link/20250915091958.15382-2-atenart@kernel.org Signed-off-by: Jakub Kicinski --- include/net/udp.h | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/udp.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index eecd64097f91..059a0cee5f55 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -404,7 +404,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, &off, err); } -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index a09aca2c8567..8878e865ddf6 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -319,7 +319,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, } int tcp_v4_early_demux(struct sk_buff *skb); -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0c40426628eb..85cfc32eb2cc 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2811,7 +2811,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -int udp_v4_early_demux(struct sk_buff *skb) +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; @@ -2825,7 +2825,7 @@ int udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return 0; + return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2834,12 +2834,12 @@ int udp_v4_early_demux(struct sk_buff *skb) in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return 0; + return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return 0; + return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2850,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb) } if (!sk) - return 0; + return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); @@ -2877,7 +2877,7 @@ int udp_v4_early_demux(struct sk_buff *skb) ip4h_dscp(iph), skb->dev, in_dev, &itag); } - return 0; + return SKB_NOT_DROPPED_YET; } int udp_rcv(struct sk_buff *skb) -- cgit v1.2.3 From dcc0e68ed300dae3325e323417773dd59a6a65db Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 15 Sep 2025 11:19:55 +0200 Subject: net: ipv4: simplify drop reason handling in ip_rcv_finish_core Instead of setting the drop reason to SKB_DROP_REASON_NOT_SPECIFIED early and having to reset it each time it is overridden by a function returned value, just set the drop reason to the expected value before returning from ip_rcv_finish_core. 
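A compressed before/after sketch of the pattern this removes (illustrative, not the exact source):

	/* before: a default reason is installed up front and has to be
	 * re-installed after every helper that may overwrite it */
	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
	drop_reason = udp_v4_early_demux(skb);
	if (unlikely(drop_reason))
		goto drop_error;
	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;	/* reset again */

	/* after: a reason is assigned only where the drop is decided */
	drop_reason = udp_v4_early_demux(skb);
	if (unlikely(drop_reason))
		goto drop_error;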
Signed-off-by: Antoine Tenart Link: https://patch.msgid.link/20250915091958.15382-3-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/ip_input.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 8878e865ddf6..93b8286e526a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -335,7 +335,6 @@ static int ip_rcv_finish_core(struct net *net, goto drop_error; } - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && @@ -354,7 +353,6 @@ static int ip_rcv_finish_core(struct net *net, drop_reason = udp_v4_early_demux(skb); if (unlikely(drop_reason)) goto drop_error; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); @@ -372,7 +370,6 @@ static int ip_rcv_finish_core(struct net *net, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -391,8 +388,10 @@ static int ip_rcv_finish_core(struct net *net, } #endif - if (iph->ihl > 5 && ip_rcv_options(skb, dev)) + if (iph->ihl > 5 && ip_rcv_options(skb, dev)) { + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; goto drop; + } rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { -- cgit v1.2.3 From 9e1e2f4ebf99d72389bb257f01f6bed70fccf66c Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 15 Sep 2025 11:19:56 +0200 Subject: net: ipv4: convert ip_rcv_options to drop reasons This converts the only path not returning drop reasons in ip_rcv_finish_core. Signed-off-by: Antoine Tenart Reviewed-by: David Ahern Link: https://patch.msgid.link/20250915091958.15382-4-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/ip_input.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 93b8286e526a..273578579a6b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -263,10 +263,11 @@ int ip_local_deliver(struct sk_buff *skb) } EXPORT_SYMBOL(ip_local_deliver); -static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) +static inline enum skb_drop_reason +ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { - struct ip_options *opt; const struct iphdr *iph; + struct ip_options *opt; /* It looks as overkill, because not all IP options require packet mangling. 
@@ -277,7 +278,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); - goto drop; + return SKB_DROP_REASON_NOMEM; } iph = ip_hdr(skb); @@ -286,7 +287,7 @@ if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); - goto drop; + return SKB_DROP_REASON_IP_INHDR; } if (unlikely(opt->srr)) { @@ -298,17 +299,15 @@ net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } } if (ip_options_rcv_srr(skb, dev)) - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } - return false; -drop: - return true; + return SKB_NOT_DROPPED_YET; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, @@ -388,9 +387,10 @@ } #endif - if (iph->ihl > 5 && ip_rcv_options(skb, dev)) { - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; - goto drop; + if (iph->ihl > 5) { + drop_reason = ip_rcv_options(skb, dev); + if (drop_reason) + goto drop; } rt = skb_rtable(skb); -- cgit v1.2.3 From b34df17d588de926212527a2f2ce72bc4e330260 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 18 Sep 2025 05:25:57 -0700 Subject: net: netpoll: remove unused netpoll pointer from netpoll_info The netpoll_info structure contains a useless pointer back to its associated netpoll. This field is never used, and the assignment in __netpoll_setup() does not contemplate multiple instances, as reported by Jay[1]. Drop both the member and its initialization to simplify the structure. Link: https://lore.kernel.org/all/2930648.1757463506@famine/ [1] Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250918-netpoll_jv-v1-1-67d50eeb2c26@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 - net/core/netpoll.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b5ea9882eda8..f22eec466040 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -55,7 +55,6 @@ struct netpoll_info { struct delayed_work tx_work; - struct netpoll *netpoll; struct rcu_head rcu; }; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5f65b62346d4..c58faa747165 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); - npinfo->netpoll = np; /* fill up the skb queue */ refill_skbs(np); -- cgit v1.2.3 From 614accf5455304ac0e708882609a34ec9aec463b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 18 Sep 2025 05:25:58 -0700 Subject: net: netpoll: use synchronize_net() instead of synchronize_rcu() Replace synchronize_rcu() with synchronize_net() in __netpoll_free(). synchronize_net() is RTNL-aware and will use the more efficient synchronize_rcu_expedited() when called under RTNL lock, avoiding the potentially expensive synchronize_rcu() in RTNL critical sections. Since __netpoll_free() is called with RTNL held (as indicated by ASSERT_RTNL()), this change improves performance by reducing the time spent in the RTNL critical section.
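For reference, the behaviour relied on here, as a simplified sketch of synchronize_net() (based on its long-standing implementation in net/core/dev.c; the exact body in the current tree may differ):

	void synchronize_net(void)
	{
		might_sleep();
		if (rtnl_is_locked())
			synchronize_rcu_expedited();	/* cheaper while holding RTNL */
		else
			synchronize_rcu();
	}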
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250918-netpoll_jv-v1-2-67d50eeb2c26@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c58faa747165..60a05d3b7c24 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -834,7 +834,7 @@ void __netpoll_free(struct netpoll *np) ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu(); + synchronize_net(); __netpoll_cleanup(np); kfree(np); } -- cgit v1.2.3 From 1488af7b8b5f9896ea88ee35aa3301713f72737c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Sep 2025 10:29:18 -0400 Subject: Bluetooth: hci_sync: Fix hci_resume_advertising_sync hci_resume_advertising_sync is supposed to resume all instances paused by hci_pause_advertising_sync. This logic is used for procedures that are only allowed when not advertising, but instance 0x00 was not being re-enabled. Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index b6f888d8354e..7a7d49890858 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2594,6 +2594,13 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev) hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } + + /* If current advertising instance is set to instance 0x00 + * then we need to re-enable it. + */ + if (!hdev->cur_adv_instance) + err = hci_enable_ext_advertising_sync(hdev, + hdev->cur_adv_instance); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop -- cgit v1.2.3 From 2e128683176a56459cef8705fc7c35f438f88abd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Aug 2025 10:27:29 -0400 Subject: Bluetooth: hci_event: Fix UAF in hci_conn_tx_dequeue This fixes the following UAF caused by not properly locking hdev when processing HCI_EV_NUM_COMP_PKTS: BUG: KASAN: slab-use-after-free in hci_conn_tx_dequeue+0x1be/0x220 net/bluetooth/hci_conn.c:3036 Read of size 4 at addr ffff8880740f0940 by task kworker/u11:0/54 CPU: 1 UID: 0 PID: 54 Comm: kworker/u11:0 Not tainted 6.16.0-rc7 #3 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci1 hci_rx_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x230 mm/kasan/report.c:480 kasan_report+0x118/0x150 mm/kasan/report.c:593 hci_conn_tx_dequeue+0x1be/0x220 net/bluetooth/hci_conn.c:3036 hci_num_comp_pkts_evt+0x1c8/0xa50 net/bluetooth/hci_event.c:4404 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Allocated by task 54: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68
poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] __hci_conn_add+0x233/0x1b30 net/bluetooth/hci_conn.c:939 le_conn_complete_evt+0x3d6/0x1220 net/bluetooth/hci_event.c:5628 hci_le_enh_conn_complete_evt+0x189/0x470 net/bluetooth/hci_event.c:5794 hci_event_func net/bluetooth/hci_event.c:7474 [inline] hci_event_packet+0x78c/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Freed by task 9572: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 device_release+0x9c/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 hci_conn_cleanup net/bluetooth/hci_conn.c:175 [inline] hci_conn_del+0x8ff/0xcb0 net/bluetooth/hci_conn.c:1173 hci_abort_conn_sync+0x5d1/0xdf0 net/bluetooth/hci_sync.c:5689 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Fixes: 134f4b39df7b ("Bluetooth: add support for skb TX SND/COMPLETION timestamping") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7a2174851857..97f543824bb0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4391,6 +4391,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "num %d", ev->num); + hci_dev_lock(hdev); + for (i = 0; i < ev->num; i++) { struct hci_comp_pkts_info *info = &ev->handles[i]; struct hci_conn *conn; @@ -4472,6 +4474,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, } queue_work(hdev->workqueue, &hdev->tx_work); + + hci_dev_unlock(hdev); } static void hci_mode_change_evt(struct hci_dev *hdev, void *data, -- cgit v1.2.3 From 9e622804d57e2d08f0271200606bd1270f75126f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Aug 2025 11:10:20 -0400 Subject: Bluetooth: hci_event: Fix UAF in hci_acl_create_conn_sync This fixes the following UAF in hci_acl_create_conn_sync where a connection still pending its command submission (conn->state == BT_OPEN) may be freed;
since this also can happen with the likes of hci_le_create_conn_sync fix it as well: BUG: KASAN: slab-use-after-free in hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 Write of size 2 at addr ffff88805ffcc038 by task kworker/u11:2/9541 CPU: 1 UID: 0 PID: 9541 Comm: kworker/u11:2 Not tainted 6.16.0-rc7 #3 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Workqueue: hci3 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x230 mm/kasan/report.c:480 kasan_report+0x118/0x150 mm/kasan/report.c:593 hci_acl_create_conn_sync+0x5ef/0x790 net/bluetooth/hci_sync.c:6861 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Allocated by task 123736: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] __hci_conn_add+0x233/0x1b30 net/bluetooth/hci_conn.c:939 hci_conn_add_unset net/bluetooth/hci_conn.c:1051 [inline] hci_connect_acl+0x16c/0x4e0 net/bluetooth/hci_conn.c:1634 pair_device+0x418/0xa70 net/bluetooth/mgmt.c:3556 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:727 sock_write_iter+0x258/0x330 net/socket.c:1131 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x54b/0xa90 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 103680: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 device_release+0x9c/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x22b/0x480 lib/kobject.c:737 hci_conn_cleanup net/bluetooth/hci_conn.c:175 [inline] hci_conn_del+0x8ff/0xcb0 net/bluetooth/hci_conn.c:1173 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 
arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Last potentially related work creation: kasan_save_stack+0x3e/0x60 mm/kasan/common.c:47 kasan_record_aux_stack+0xbd/0xd0 mm/kasan/generic.c:548 insert_work+0x3d/0x330 kernel/workqueue.c:2183 __queue_work+0xbd9/0xfe0 kernel/workqueue.c:2345 queue_delayed_work_on+0x18b/0x280 kernel/workqueue.c:2561 pairing_complete+0x1e7/0x2b0 net/bluetooth/mgmt.c:3451 pairing_complete_cb+0x1ac/0x230 net/bluetooth/mgmt.c:3487 hci_connect_cfm include/net/bluetooth/hci_core.h:2064 [inline] hci_conn_failed+0x24d/0x310 net/bluetooth/hci_conn.c:1275 hci_conn_complete_evt+0x3c7/0x1040 net/bluetooth/hci_event.c:3199 hci_event_func net/bluetooth/hci_event.c:7477 [inline] hci_event_packet+0x7e0/0x1200 net/bluetooth/hci_event.c:7531 hci_rx_work+0x46a/0xe80 net/bluetooth/hci_core.c:4070 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16-rc7/arch/x86/entry/entry_64.S:245 Fixes: aef2aa4fa98e ("Bluetooth: hci_event: Fix creating hci_conn object on error status") Reported-by: Junvyyang, Tencent Zhuque Lab Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 21 +++++++++++++++++++++ net/bluetooth/hci_event.c | 26 +++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6906af7a8f24..6560b32f3125 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1245,6 +1245,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 97f543824bb0..fe49e8a7969f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3087,8 +3087,18 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* Check for existing connection: + * + * 1. If it doesn't exist then it must be receiver/slave role. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. 
*/ @@ -5638,8 +5648,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); - if (!conn) { + /* Check for existing connection: + * + * 1. If it doesn't exist then use the role to create a new object. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ + conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr); + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ -- cgit v1.2.3 From 6df164e29bd4e6505c5a2e0e5f1e1f6957a16a42 Mon Sep 17 00:00:00 2001 From: Lei Lu Date: Mon, 11 Aug 2025 21:58:48 +0800 Subject: sunrpc: fix null pointer dereference on zero-length checksum In xdr_stream_decode_opaque_auth(), zero-length checksum.len causes checksum.data to be set to NULL. This triggers a NULL pointer dereference (NPD) when accessing checksum.data in gss_krb5_verify_mic_v2(). This patch ensures that the value of checksum.len is not less than XDR_UNIT. Fixes: 0653028e8f1c ("SUNRPC: Convert gss_verify_header() to use xdr_stream") Cc: stable@kernel.org Signed-off-by: Lei Lu Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e82212f6b562..a8ec30759a18 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -724,7 +724,7 @@ svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } - if (flavor != RPC_AUTH_GSS) { + if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } -- cgit v1.2.3 From d9adbb6e10bf7d4223d3d521ede1b2052903bc5e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Jul 2025 14:14:53 -0400 Subject: sunrpc: delay pc_release callback until after the reply is sent The server-side sunrpc code currently calls pc_release before sending the reply. Change svc_process and svc_process_bc to call pc_release after sending the reply instead. Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..fc70e13b1cb9 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1426,8 +1426,6 @@ svc_process_common(struct svc_rqst *rqstp) /* Call the function that processes the request.
*/ rc = process.dispatch(rqstp); - if (procp->pc_release) - procp->pc_release(rqstp); xdr_finish_decode(xdr); if (!rc) @@ -1526,6 +1524,14 @@ static void svc_drop(struct svc_rqst *rqstp) trace_svc_drop(rqstp); } +static void svc_release_rqst(struct svc_rqst *rqstp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + if (procp && procp->pc_release) + procp->pc_release(rqstp); +} + /** * svc_process - Execute one RPC transaction * @rqstp: RPC transaction context @@ -1565,9 +1571,12 @@ void svc_process(struct svc_rqst *rqstp) if (unlikely(*p != rpc_call)) goto out_baddir; - if (!svc_process_common(rqstp)) + if (!svc_process_common(rqstp)) { + svc_release_rqst(rqstp); goto out_drop; + } svc_send(rqstp); + svc_release_rqst(rqstp); return; out_baddir: @@ -1635,6 +1644,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); + svc_release_rqst(rqstp); return; } /* Finally, send the reply synchronously */ @@ -1648,6 +1658,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); + svc_release_rqst(rqstp); if (IS_ERR(task)) return; -- cgit v1.2.3 From a9a15ba23efc4d6d34127e8d175ae63a95434f58 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2025 10:37:07 -0400 Subject: sunrpc: fix pr_notice in svc_tcp_sendto() to show correct length This pr_notice() is confusing since it only prints xdr->len, which doesn't include the 4-byte record marker. That can make it sometimes look like the socket sent more than was requested if it's short by just a few bytes. Add sizeof(marker) to the size and fix the format accordingly. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e2c5e0e626f9..1afaeb45d6a3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1293,10 +1293,10 @@ out_notconn: mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: - pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len); + (err < 0) ? err : sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; -- cgit v1.2.3 From 7569065fb123f8428cb9d29939dd16d43d4b50c4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2025 10:37:08 -0400 Subject: sunrpc: eliminate return pointer in svc_tcp_sendmsg() Return a positive value if something was sent, or a negative error code. Eliminate the "err" variable in the only caller as well. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 1afaeb45d6a3..c0d5a27ba674 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1224,7 +1224,7 @@ err_noclose: * that the pages backing @xdr are unchanging. 
*/ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, - rpc_fraghdr marker, int *sentp) + rpc_fraghdr marker) { struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, @@ -1233,8 +1233,6 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, void *buf; int ret; - *sentp = 0; - /* The stream record marker is copied into a temporary page * fragment buffer so that it can be included in rq_bvec. */ @@ -1252,10 +1250,7 @@ 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); - if (ret < 0) - return ret; - *sentp += ret; - return 0; + return ret; } /** @@ -1274,7 +1269,7 @@ struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - int sent, err; + int sent; svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; @@ -1282,9 +1277,9 @@ mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent); - trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); - if (err < 0 || sent != (xdr->len + sizeof(marker))) + sent = svc_tcp_sendmsg(svsk, rqstp, marker); + trace_svcsock_tcp_send(xprt, sent); + if (sent < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; mutex_unlock(&xprt->xpt_mutex); return sent; @@ -1295,8 +1290,8 @@ out_notconn: out_close: pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, - (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len + sizeof(marker)); + (sent < 0) ? "got error" : "sent", + sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; -- cgit v1.2.3 From 898374fdd7f06fa4c4a66e8be3135efeae6128d5 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 19 Aug 2025 14:04:02 -0400 Subject: nfsd: unregister with rpcbind when deleting a transport When a listener is added, part of creating the transport also registers the program/port with rpcbind. However, when the listener is removed, the transport goes away but rpcbind still keeps the entry for that port/type. When deleting the transport, unregister with rpcbind when appropriate. ---v2: created a new xpt_flag, XPT_RPCB_UNREG, to mark TCP and UDP transports, and at xprt destroy send an rpcbind unregister if the flag is set. Suggested-by: Chuck Lever Fixes: d093c9089260 ("nfsd: fix management of listener transports") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 3 +++ net/sunrpc/svc_xprt.c | 13 +++++++++++++ net/sunrpc/svcsock.c | 2 ++ 3 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..2b886f7eb295 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -104,6 +104,9 @@ enum { * it has access to. It is NOT counted * in ->sv_tmpcnt.
*/ + XPT_RPCB_UNREG, /* transport that needs unregistering + * with rpcbind (TCP, UDP) on destroy + */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..b800d704d807 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1014,6 +1014,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. + */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c0d5a27ba674..7b90abc5cf0e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -836,6 +836,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -1350,6 +1351,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { -- cgit v1.2.3 From d73d06dac604043b94a5f18ebb6a69da1b867702 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call sites into svc_xprt_destroy_all(). Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! 
net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 2b886f7eb295..da2a2531e110 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -168,7 +168,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fc70e13b1cb9..cb4010e2dc0c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b800d704d807..6973184ff667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1115,6 +1115,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1127,7 +1128,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. 
*/ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1137,6 +1139,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); -- cgit v1.2.3 From 6c15463c4511d26f2a820f63f5b76624a71afc44 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 12 Aug 2025 19:33:59 +0800 Subject: sunrpc: fix "occurence"->"occurrence" Trivial fix to spelling mistake in comment text. Signed-off-by: Xichao Zhao Reviewed-by: Joe Damato Signed-off-by: Chuck Lever --- net/sunrpc/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 09434e1143c5..8b01b7ae2690 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -389,7 +389,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); - /* buf_len is the len until the first occurence of either + /* buf_len is the len until the first occurrence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); -- cgit v1.2.3 From e0d3bba84ff8b82d4e8820856a7850afb17c14f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 19 Sep 2025 12:23:25 +0200 Subject: wifi: cfg80211: remove IEEE80211_CHAN_{1,2,4,8,16}MHZ flags These were used by S1G for older chandef representation, but are no longer needed. Clean them up, even if we can't drop them from the userspace API entirely. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 16 +--------------- net/wireless/nl80211.c | 15 --------------- 2 files changed, 1 insertion(+), 30 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1c041ce7a03b..781624f5913a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -101,16 +101,6 @@ struct wiphy; * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. - * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted - * on this channel. * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, * this flag indicates that a 320 MHz channel cannot use this * channel as the control or any of the secondary channels. 
@@ -152,11 +142,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = BIT(11), IEEE80211_CHAN_NO_10MHZ = BIT(12), IEEE80211_CHAN_NO_HE = BIT(13), - IEEE80211_CHAN_1MHZ = BIT(14), - IEEE80211_CHAN_2MHZ = BIT(15), - IEEE80211_CHAN_4MHZ = BIT(16), - IEEE80211_CHAN_8MHZ = BIT(17), - IEEE80211_CHAN_16MHZ = BIT(18), + /* can use free bits here */ IEEE80211_CHAN_NO_320MHZ = BIT(19), IEEE80211_CHAN_NO_EHT = BIT(20), IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de34a1d14073..346dfd2bd987 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1280,21 +1280,6 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_1MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_2MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_4MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_8MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_16MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) - goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) goto nla_put_failure; -- cgit v1.2.3 From d7610cb7454bbd8bf6d58f71b0ed57155d3c545f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:36 +0200 Subject: ns: simplify ns_common_init() further Simply derive the ns operations from the namespace type. 
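The mechanism here is C11 _Generic selection: the macro derives the proc_ns_operations table from the static type of the namespace pointer, so callers no longer spell it out. A minimal stand-alone sketch of the same dispatch pattern (the struct definitions and strings below are illustrative stand-ins, not the kernel's types or tables):

#include <stdio.h>

struct net { int dummy; };
struct uts_namespace { int dummy; };

/* Select a value from the static type of the argument, the same way
 * to_ns_operations() in the hunk below selects a proc_ns_operations
 * table. */
#define ns_ops_name(ns)					\
	_Generic((ns),					\
		struct net *: "netns_operations",	\
		struct uts_namespace *: "utsns_operations")

int main(void)
{
	struct net net_ns;
	struct uts_namespace uts_ns;

	puts(ns_ops_name(&net_ns));	/* prints "netns_operations" */
	puts(ns_ops_name(&uts_ns));	/* prints "utsns_operations" */
	return 0;
}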
Acked-by: Thomas Gleixner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 30 ++++++++++++++++++++++++++---- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 9 +-------- 9 files changed, 35 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/fs/namespace.c b/fs/namespace.c index 271cd6294c8a..d65917ec5544 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,9 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else - ret = ns_common_init(new_ns, &mntns_operations); + ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index aea8528d799a..56492cd9ff8d 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -25,6 +25,17 @@ extern struct time_namespace init_time_ns; extern struct user_namespace init_user_ns; extern struct uts_namespace init_uts_ns; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -84,10 +95,21 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) -#define ns_common_init(__ns, __ops) \ - __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/ipc/namespace.c b/ipc/namespace.c index bd85d1c9d2c2..d89dfd718d2b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(ns, &ipcns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 16ead7508371..04c98338ac08 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(new_ns, &cgroupns_operations); + ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 162f5fb63d75..a262a3f19443 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(ns, &pidns_operations); + err = ns_common_init(ns); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 7aa4d6fedd49..9f26e61be044 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(ns, &timens_operations); + err = ns_common_init(ns); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f9df45c46235..e1559e8a8a02 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(ns, &userns_operations); + ret = ns_common_init(ns); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index 95d733eb2c98..00001592ad13 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(ns, &utsns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d5e3fd819163..bdea7d5fac56 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -400,16 +400,9 @@ static __net_init void preinit_net_sysctl(struct net *net) /* init code that must occur even if setup_net() is not called. 
 */
 static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
 {
-	const struct proc_ns_operations *ns_ops;
 	int ret;
 
-#ifdef CONFIG_NET_NS
-	ns_ops = &netns_operations;
-#else
-	ns_ops = NULL;
-#endif
-
-	ret = ns_common_init(net, ns_ops);
+	ret = ns_common_init(net);
 	if (ret)
 		return ret;
--
cgit v1.2.3


From 302a1f674c00dd5581ab8e493ef44767c5101aab Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz
Date: Mon, 25 Aug 2025 10:03:07 -0400
Subject: Bluetooth: MGMT: Fix possible UAFs

This attempts to fix possible UAFs caused by a struct mgmt_pending being
freed while it is still being processed, as in the trace below. To fix
this, mgmt_pending_valid is introduced and used to check that the
mgmt_pending hasn't been removed from the pending list. In the complete
callbacks it is used both to check the cmd and, in addition, to remove
it from the list while holding mgmt_pending_lock, avoiding TOCTOU
problems: if the cmd were left on the list, it could still be accessed
and freed.

BUG: KASAN: slab-use-after-free in mgmt_add_adv_patterns_monitor_sync+0x35/0x50 net/bluetooth/mgmt.c:5223
Read of size 8 at addr ffff8880709d4dc0 by task kworker/u11:0/55

CPU: 0 UID: 0 PID: 55 Comm: kworker/u11:0 Not tainted 6.16.4 #2 PREEMPT(full)
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
Workqueue: hci0 hci_cmd_sync_work
Call Trace:
 dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:378 [inline]
 print_report+0xca/0x240 mm/kasan/report.c:482
 kasan_report+0x118/0x150 mm/kasan/report.c:595
 mgmt_add_adv_patterns_monitor_sync+0x35/0x50 net/bluetooth/mgmt.c:5223
 hci_cmd_sync_work+0x210/0x3a0 net/bluetooth/hci_sync.c:332
 process_one_work kernel/workqueue.c:3238 [inline]
 process_scheduled_works+0xade/0x17b0 kernel/workqueue.c:3321
 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402
 kthread+0x711/0x8a0 kernel/kthread.c:464
 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148
 ret_from_fork_asm+0x1a/0x30 home/kwqcheii/source/fuzzing/kernel/kasan/linux-6.16.4/arch/x86/entry/entry_64.S:245

Allocated by task 12210:
 kasan_save_stack mm/kasan/common.c:47 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:68
 poison_kmalloc_redzone mm/kasan/common.c:377 [inline]
 __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394
 kasan_kmalloc include/linux/kasan.h:260 [inline]
 __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4364
 kmalloc_noprof include/linux/slab.h:905 [inline]
 kzalloc_noprof include/linux/slab.h:1039 [inline]
 mgmt_pending_new+0x65/0x1e0 net/bluetooth/mgmt_util.c:269
 mgmt_pending_add+0x35/0x140 net/bluetooth/mgmt_util.c:296
 __add_adv_patterns_monitor+0x130/0x200 net/bluetooth/mgmt.c:5247
 add_adv_patterns_monitor+0x214/0x360 net/bluetooth/mgmt.c:5364
 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719
 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839
 sock_sendmsg_nosec net/socket.c:714 [inline]
 __sock_sendmsg+0x219/0x270 net/socket.c:729
 sock_write_iter+0x258/0x330 net/socket.c:1133
 new_sync_write fs/read_write.c:593 [inline]
 vfs_write+0x5c9/0xb30 fs/read_write.c:686
 ksys_write+0x145/0x250 fs/read_write.c:738
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Freed by task 12221:
 kasan_save_stack mm/kasan/common.c:47 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:68
 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576
 poison_slab_object mm/kasan/common.c:247 [inline]
 __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264
 kasan_slab_free
include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4648 [inline] kfree+0x18e/0x440 mm/slub.c:4847 mgmt_pending_free net/bluetooth/mgmt_util.c:311 [inline] mgmt_pending_foreach+0x30d/0x380 net/bluetooth/mgmt_util.c:257 __mgmt_power_off+0x169/0x350 net/bluetooth/mgmt.c:9444 hci_dev_close_sync+0x754/0x1330 net/bluetooth/hci_sync.c:5290 hci_dev_do_close net/bluetooth/hci_core.c:501 [inline] hci_dev_close+0x108/0x200 net/bluetooth/hci_core.c:526 sock_do_ioctl+0xd9/0x300 net/socket.c:1192 sock_ioctl+0x576/0x790 net/socket.c:1313 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED") Fixes: 2bd1b237616b ("Bluetooth: hci_sync: Convert MGMT_OP_SET_DISCOVERABLE to use cmd_sync") Fixes: f056a65783cc ("Bluetooth: hci_sync: Convert MGMT_OP_SET_CONNECTABLE to use cmd_sync") Fixes: 3244845c6307 ("Bluetooth: hci_sync: Convert MGMT_OP_SSP") Fixes: d81a494c43df ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LE") Fixes: b338d91703fa ("Bluetooth: Implement support for Mesh") Fixes: 6f6ff38a1e14 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_LOCAL_NAME") Fixes: 71efbb08b538 ("Bluetooth: hci_sync: Convert MGMT_OP_SET_PHY_CONFIGURATION") Fixes: b747a83690c8 ("Bluetooth: hci_sync: Refactor add Adv Monitor") Fixes: abfeea476c68 ("Bluetooth: hci_sync: Convert MGMT_OP_START_DISCOVERY") Fixes: 26ac4c56f03f ("Bluetooth: hci_sync: Convert MGMT_OP_SET_ADVERTISING") Reported-by: cen zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 259 ++++++++++++++++++++++++++++++++-------------- net/bluetooth/mgmt_util.c | 46 ++++++++ net/bluetooth/mgmt_util.h | 3 + 3 files changed, 231 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 50634ef5c8b7..225140fcb3d6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1323,8 +1323,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_mode *cp; /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; cp = cmd->param; @@ -1351,23 +1350,29 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp; + struct mgmt_mode cp; + + mutex_lock(&hdev->mgmt_pending_lock); /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); return -ECANCELED; + } - cp = cmd->param; + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); BT_DBG("%s", hdev->name); - return hci_set_powered_sync(hdev, cp->val); + return hci_set_powered_sync(hdev, cp.val); } static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, @@ -1516,8 +1521,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. 
*/ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1539,12 +1543,15 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } static int set_discoverable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_discoverable_sync(hdev); @@ -1691,8 +1698,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1707,7 +1713,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } @@ -1743,6 +1749,9 @@ static int set_connectable_update_settings(struct hci_dev *hdev, static int set_connectable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_connectable_sync(hdev); @@ -1919,14 +1928,17 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) { struct cmd_lookup match = { NULL, hdev }; struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 enable = cp->val; + struct mgmt_mode *cp; + u8 enable; bool changed; /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + enable = cp->val; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -1935,8 +1947,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) new_settings(hdev, NULL); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, - cmd_status_rsp, &mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); return; } @@ -1946,7 +1957,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); if (changed) new_settings(hdev, match.sk); @@ -1960,14 +1971,25 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) static int set_ssp_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode cp; bool changed = false; int err; - if (cp->val) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + if (cp.val) changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); - err = hci_write_ssp_mode_sync(hdev, cp->val); + err = hci_write_ssp_mode_sync(hdev, cp.val); if (!err && changed) hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); @@ -2060,32 +2082,50 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) static void set_le_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct 
cmd_lookup match = { NULL, hdev }; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (status) { - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp, - &status); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) return; + + if (status) { + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + goto done; } - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); + +done: + mgmt_pending_free(cmd); } static int set_le_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; int err; + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + val = !!cp.val; + + mutex_unlock(&hdev->mgmt_pending_lock); + if (!val) { hci_clear_adv_instance_sync(hdev, NULL, 0x00, true); @@ -2127,7 +2167,12 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; u8 status = mgmt_status(err); - struct sock *sk = cmd->sk; + struct sock *sk; + + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) + return; + + sk = cmd->sk; if (status) { mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, @@ -2142,24 +2187,37 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) static int set_mesh_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_mesh *cp = cmd->param; - size_t len = cmd->param_len; + struct mgmt_cp_set_mesh cp; + size_t len; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + len = cmd->param_len; memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); - if (cp->enable) + if (cp.enable) hci_dev_set_flag(hdev, HCI_MESH); else hci_dev_clear_flag(hdev, HCI_MESH); - hdev->le_scan_interval = __le16_to_cpu(cp->period); - hdev->le_scan_window = __le16_to_cpu(cp->window); + hdev->le_scan_interval = __le16_to_cpu(cp.period); + hdev->le_scan_window = __le16_to_cpu(cp.window); - len -= sizeof(*cp); + len -= sizeof(cp); /* If filters don't fit, forward all adv pkts */ if (len <= sizeof(hdev->mesh_ad_types)) - memcpy(hdev->mesh_ad_types, cp->ad_types, len); + memcpy(hdev->mesh_ad_types, cp.ad_types, len); hci_update_passive_scan_sync(hdev); return 0; @@ -3867,15 +3925,16 @@ static int name_changed_sync(struct hci_dev *hdev, void *data) static void set_name_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name *cp; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3887,16 +3946,27 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_name_sync(struct hci_dev *hdev, 
void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name cp; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev, cp->name); + hci_update_name_sync(hdev, cp.name); hci_update_eir_sync(hdev); } @@ -4048,12 +4118,10 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct sk_buff *skb = cmd->skb; + struct sk_buff *skb; u8 status = mgmt_status(err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) - return; + skb = cmd->skb; if (!status) { if (!skb) @@ -4080,7 +4148,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) if (skb && !IS_ERR(skb)) kfree_skb(skb); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_default_phy_sync(struct hci_dev *hdev, void *data) @@ -4088,7 +4156,9 @@ static int set_default_phy_sync(struct hci_dev *hdev, void *data) struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_set_phy_configuration *cp = cmd->param; struct hci_cp_le_set_default_phy cp_phy; - u32 selected_phys = __le32_to_cpu(cp->selected_phys); + u32 selected_phys; + + selected_phys = __le32_to_cpu(cp->selected_phys); memset(&cp_phy, 0, sizeof(cp_phy)); @@ -4228,7 +4298,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, goto unlock; } - cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, + cmd = mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, len); if (!cmd) err = -ENOMEM; @@ -5189,7 +5259,17 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, { struct mgmt_rp_add_adv_patterns_monitor rp; struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *monitor; + + /* This is likely the result of hdev being closed and mgmt_index_removed + * is attempting to clean up any pending command so + * hci_adv_monitors_clear is about to be called which will take care of + * freeing the adv_monitor instances. 
+ */ + if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd)) + return; + + monitor = cmd->user_data; hci_dev_lock(hdev); @@ -5215,9 +5295,20 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *mon; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + mon = cmd->user_data; + + mutex_unlock(&hdev->mgmt_pending_lock); - return hci_add_adv_monitor(hdev, monitor); + return hci_add_adv_monitor(hdev, mon); } static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, @@ -5484,7 +5575,8 @@ unlock: status); } -static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int err) +static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, + int err) { struct mgmt_rp_read_local_oob_data mgmt_rp; size_t rp_size = sizeof(mgmt_rp); @@ -5504,7 +5596,8 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int e bt_dev_dbg(hdev, "status %d", status); if (status) { - mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, status); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + status); goto remove; } @@ -5786,17 +5879,12 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED) - return; - - if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_discovery_set_state(hdev, err ? 
DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5804,6 +5892,9 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) static int start_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_start_discovery_sync(hdev); } @@ -6009,15 +6100,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -6025,6 +6115,9 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) static int stop_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_stop_discovery_sync(hdev); } @@ -6234,14 +6327,18 @@ static void enable_advertising_instance(struct hci_dev *hdev, int err) static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct cmd_lookup match = { NULL, hdev }; u8 instance; struct adv_info *adv_instance; u8 status = mgmt_status(err); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) + return; + if (status) { - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, - cmd_status_rsp, &status); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + mgmt_pending_free(cmd); return; } @@ -6250,8 +6347,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) else hci_dev_clear_flag(hdev, HCI_ADVERTISING); - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp, - &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); @@ -6283,10 +6379,23 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) static int set_adv_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; - if (cp->val == 0x02) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + val = !!cp.val; + + if (cp.val == 0x02) hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); else hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); @@ -8039,10 +8148,6 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) - return; - if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -8149,7 +8254,7 @@ done: kfree_skb(skb); kfree(mgmt_rp); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, @@ -8158,7 +8263,7 @@ static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, struct mgmt_pending_cmd *cmd; int err; - cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, + cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, cp, sizeof(*cp)); if (!cmd) return -ENOMEM; diff --git 
a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index a88a07da3947..aa7b5585cb26 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -320,6 +320,52 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) mgmt_pending_free(cmd); } +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + struct mgmt_pending_cmd *tmp; + + lockdep_assert_held(&hdev->mgmt_pending_lock); + + if (!cmd) + return false; + + list_for_each_entry(tmp, &hdev->mgmt_pending, list) { + if (cmd == tmp) + return true; + } + + return false; +} + +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + mutex_lock(&hdev->mgmt_pending_lock); + listed = __mgmt_pending_listed(hdev, cmd); + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + if (!cmd) + return false; + + mutex_lock(&hdev->mgmt_pending_lock); + + listed = __mgmt_pending_listed(hdev, cmd); + if (listed) + list_del(&cmd->list); + + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index 024e51dd6937..bcba8c9d8952 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -65,6 +65,9 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, void *data, u16 len); void mgmt_pending_free(struct mgmt_pending_cmd *cmd); void mgmt_pending_remove(struct mgmt_pending_cmd *cmd); +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk); -- cgit v1.2.3 From 6445bb832dc0ba0ab816e5bd79ef0209cdd46d3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:28 +0000 Subject: tcp: Remove osk from __inet_hash() arg. __inet_hash() is called from inet_hash() and inet6_hash with osk NULL. Let's remove the 2nd arg from __inet_hash(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- net/ipv4/inet_hashtables.c | 6 +++--- net/ipv6/inet6_hashtables.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index a3b32241c2f2..64bc8870db88 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -289,7 +289,7 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk, struct sock *osk); +int __inet_hash(struct sock *sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ef4ccfd46ff6..baee5c075e6c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -739,7 +739,7 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk, struct sock *osk) +int __inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; @@ -747,7 +747,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); - inet_ehash_nolisten(sk, osk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); local_bh_enable(); return 0; } @@ -779,7 +779,7 @@ int inet_hash(struct sock *sk) int err = 0; if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); + err = __inet_hash(sk); return err; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index a3a9ea49fee2..64fcd7df0c9a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -374,7 +374,7 @@ int inet6_hash(struct sock *sk) int err = 0; if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); + err = __inet_hash(sk); return err; } -- cgit v1.2.3 From 0ac44301e3bf4f5abc892ab530188ca95c61e59f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:29 +0000 Subject: tcp: Remove inet6_hash(). inet_hash() and inet6_hash() are exactly the same. Also, we do not need to export inet6_hash(). Let's consolidate the two into __inet_hash() and rename it to inet_hash(). 
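The consolidated function keeps both behaviours behind one entry point: return early for TCP_CLOSE (the check that used to live in the inet_hash()/inet6_hash() wrappers), take the ehash path for non-listening sockets, and fall through to the listen bucket otherwise. A condensed sketch of the resulting control flow, with the locking, error handling, and the listener path of the real function elided (see the inet_hashtables.c hunk below for the actual body):

int inet_hash(struct sock *sk)
{
	/* Previously done by the inet_hash()/inet6_hash() wrappers. */
	if (sk->sk_state == TCP_CLOSE)
		return 0;

	if (sk->sk_state != TCP_LISTEN) {
		/* Established and other non-listening sockets: ehash. */
		inet_ehash_nolisten(sk, NULL, NULL);
		return 0;
	}

	/* Listening sockets: the ilb2 listen bucket (elided here). */
	return 0;
}

With this in place, both tcp_prot and tcpv6_prot can point their .hash callback at the same function, as the tcp_ipv6.c hunk below shows for tcpv6_prot.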
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 -- include/net/inet_hashtables.h | 1 - net/ipv4/inet_hashtables.c | 17 +++++------------ net/ipv6/inet6_hashtables.c | 11 ----------- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 6 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 1f985d2012ce..282e29237d93 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -167,8 +167,6 @@ struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *daddr, const __be16 dport, const int dif); -int inet6_hash(struct sock *sk); - static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 64bc8870db88..b787be651ce7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -289,7 +289,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index baee5c075e6c..efa8a615b868 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -739,12 +739,15 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; + if (sk->sk_state == TCP_CLOSE) + return 0; + if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); inet_ehash_nolisten(sk, NULL, NULL); @@ -772,17 +775,7 @@ unlock: return err; } -EXPORT_IPV6_MOD(__inet_hash); - -int inet_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk); - - return err; -} +EXPORT_IPV6_MOD(inet_hash); void inet_unhash(struct sock *sk) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 64fcd7df0c9a..5e1da088d8e1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -368,14 +368,3 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); - -int inet6_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet6_hash); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d1e5b2a186fb..9622c2776ade 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2355,7 +2355,7 @@ struct proto tcpv6_prot = { .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, -- cgit v1.2.3 From bb6f9445666e1ed9f39c805e153243a65ea05257 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Sep 2025 08:35:30 +0000 Subject: tcp: Remove redundant sk_unhashed() in inet_unhash(). 
inet_unhash() checks sk_unhashed() twice at the entry and after locking ehash/lhash bucket. The former was somehow added redundantly by commit 4f9bf2a2f5aa ("tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH."). inet_unhash() is called for the full socket from 4 places, and it is always under lock_sock() or the socket is not yet published to other threads: 1. __sk_prot_rehash() -> called from inet_sk_reselect_saddr(), which has lockdep_sock_is_held() 2. sk_common_release() -> called when inet_create() or inet6_create() fail, then the socket is not yet published 3. tcp_set_state() -> calls tcp_call_bpf_2arg(), and tcp_call_bpf() has sock_owned_by_me() 4. inet_ctl_sock_create() -> creates a kernel socket and unhashes it immediately, but TCP socket is not hashed in sock_create_kern() (only SOCK_RAW is) So we do not need to check sk_unhashed() twice before/after ehash/lhash lock in inet_unhash(). Let's remove the 2nd one. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250919083706.1863217-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index efa8a615b868..4eb933f56fe6 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -793,11 +793,6 @@ void inet_unhash(struct sock *sk) * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); - if (sk_unhashed(sk)) { - spin_unlock(&ilb2->lock); - return; - } - if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); @@ -808,10 +803,6 @@ void inet_unhash(struct sock *sk) spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - if (sk_unhashed(sk)) { - spin_unlock_bh(lock); - return; - } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); -- cgit v1.2.3 From c9809f03c158f07eaa76c7dd3606fc0a184520f2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:58 +0200 Subject: mptcp: pm: netlink: only add server-side attr when true This attribute is a boolean. No need to add it to set it to 'false'. Indeed, the default value when this attribute is not set is naturally 'false'. A few bytes can then be saved by not adding this attribute if the connection is not on the server side. This prepares the future deprecation of its attribute, in favour of a new flag. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-1-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 4 +++- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d1b4829b580a..fc47a2931014 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). 
Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 7359d34da446..bf44a5cf5b5a 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 483ddbb9ec40..33a6bf536c02 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,7 +413,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + /* only set when it is the server side */ + if (READ_ONCE(msk->pm.server_side) && + nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; if (READ_ONCE(msk->pm.remote_deny_join_id0)) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 3d45991f24ed..87323942cb8a 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -241,7 +241,7 @@ make_connection() print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && - [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_serverside:-0}" = 0 ] && [ "${server_serverside:-0}" = 1 ] && [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass -- cgit v1.2.3 From 3d7ae91107b839ffeeb19730a2e2a46e0054bae8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:59 +0200 Subject: mptcp: pm: netlink: announce server-side flag Now that the 'flags' attribute is used, it seems interesting to add one flag for 'server-side', a boolean value. This is duplicating the info from the dedicated 'server-side' attribute, but it will be deprecated in the next commit, and removed in a few versions. 
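For a user-space listener, the practical effect is that the server-side bit can now be read from the event's generic flags attribute instead of the dedicated attribute. A hedged sketch, assuming the u16 flags value has already been extracted from the netlink message (the helper name is illustrative, not part of the uapi):

#include <linux/mptcp.h>
#include <stdbool.h>
#include <stdint.h>

/* MPTCP_PM_EV_FLAG_SERVER_SIDE is _BITUL(1), per the uapi hunk above. */
static bool mptcp_event_is_server_side(uint16_t flags)
{
	return flags & MPTCP_PM_EV_FLAG_SERVER_SIDE;
}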
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-2-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5fd5b4cf75ca..95d621f6d598 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -32,6 +32,7 @@ #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) +#define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 33a6bf536c02..aa0c73faaa6a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,10 +413,13 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - /* only set when it is the server side */ - if (READ_ONCE(msk->pm.server_side) && - nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) - return -EMSGSIZE; + if (READ_ONCE(msk->pm.server_side)) { + flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; + + /* only set when it is the server side */ + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) + return -EMSGSIZE; + } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; -- cgit v1.2.3 From c8bc168f5f3d152b378726f89e8561ccedcb5d5c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:09:00 +0200 Subject: mptcp: pm: netlink: deprecate server-side attribute Now that such info is in the 'flags' attribute, it is time to deprecate the dedicated 'server-side' attribute. It will be removed in a few versions. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-3-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 1 + net/mptcp/pm_netlink.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index fc47a2931014..ba30a40b9dbf 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -266,6 +266,7 @@ attribute-sets: - name: server-side type: u8 + doc: "Deprecated: use 'flags'" operations: list: diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index aa0c73faaa6a..d5b383870f79 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -416,7 +416,7 @@ static int mptcp_event_created(struct sk_buff *skb, if (READ_ONCE(msk->pm.server_side)) { flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; - /* only set when it is the server side */ + /* Deprecated, and only set when it is the server side */ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; } -- cgit v1.2.3 From 1be5b82c45850f495adf67887075507d5e8a860b Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:09:03 +0200 Subject: mptcp: remove unused returned value of check_data_fin When working on a fix modifying mptcp_check_data_fin(), I noticed the returned value was no longer used. It looks like it was used for 3 days, between commit 7ed90803a213 ("mptcp: send explicit ack on delayed ack_seq incr") and commit ea4ca586b16f ("mptcp: refine MPTCP-level ack scheduling"). 
This returned value can be safely removed.

Reviewed-by: Geliang Tang
Signed-off-by: Matthieu Baerts (NGI0)
Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-6-a97a5d561a8b@kernel.org
Signed-off-by: Jakub Kicinski
---
 net/mptcp/protocol.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d9fbddb99ad0..735a209d4072 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -545,11 +545,10 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied)
 	}
 }
 
-static bool mptcp_check_data_fin(struct sock *sk)
+static void mptcp_check_data_fin(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	u64 rcv_data_fin_seq;
-	bool ret = false;
 
 	/* Need to ack a DATA_FIN received from a peer while this side
 	 * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
@@ -588,12 +587,10 @@ static bool mptcp_check_data_fin(struct sock *sk)
 			break;
 		}
 
-		ret = true;
 		if (!__mptcp_check_fallback(msk))
 			mptcp_send_ack(msk);
 		mptcp_close_wake_up(sk);
 	}
-	return ret;
 }
 
 static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
--
cgit v1.2.3


From 9870d350e45a5724ee25f77aa0b6d053c9b766db Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Thu, 18 Sep 2025 16:24:25 +0200
Subject: net: replace use of system_unbound_wq with system_dfl_wq

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

system_unbound_wq should be the default workqueue so as not to enforce
locality constraints for random work whenever it's not required.

Adding system_dfl_wq to encourage its use when unbound work should be
used. The old system_unbound_wq will be kept for a few release cycles.
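The conversion itself is mechanical, as the hunks below show: for work with no locality requirement, system_dfl_wq is a drop-in replacement for system_unbound_wq. A sketch with a hypothetical work item (my_work and my_work_fn are illustrative, not from the patch):

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	/* ... no CPU-locality assumptions in the handler ... */
}
static DECLARE_WORK(my_work, my_work_fn);

static void kick_my_work(void)
{
	/* Legacy spelling, kept for a few release cycles:
	 *	queue_work(system_unbound_wq, &my_work);
	 * Preferred spelling for unbound (default) work:
	 */
	queue_work(system_dfl_wq, &my_work);
}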
Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Link: https://patch.msgid.link/20250918142427.309519-2-marco.crivellari@suse.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/macvlan.c       | 2 +-
 drivers/net/netdevsim/dev.c | 6 +++---
 net/core/link_watch.c       | 4 ++--
 net/unix/garbage.c          | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 4df991e494bd..7966545512cf 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -369,7 +369,7 @@ static void macvlan_broadcast_enqueue(struct macvlan_port *port,
 	}
 	spin_unlock(&port->bc_queue.lock);
 
-	queue_work(system_unbound_wq, &port->bc_work);
+	queue_work(system_dfl_wq, &port->bc_work);
 
 	if (err)
 		goto free_nskb;
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index 2672d071b325..95f66c1f59db 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -851,7 +851,7 @@ static void nsim_dev_trap_report_work(struct work_struct *work)
 	nsim_dev = nsim_trap_data->nsim_dev;
 
 	if (!devl_trylock(priv_to_devlink(nsim_dev))) {
-		queue_delayed_work(system_unbound_wq,
+		queue_delayed_work(system_dfl_wq,
 				   &nsim_dev->trap_data->trap_report_dw, 1);
 		return;
 	}
@@ -867,7 +867,7 @@ static void nsim_dev_trap_report_work(struct work_struct *work)
 		cond_resched();
 	}
 	devl_unlock(priv_to_devlink(nsim_dev));
-	queue_delayed_work(system_unbound_wq,
+	queue_delayed_work(system_dfl_wq,
 			   &nsim_dev->trap_data->trap_report_dw,
 			   msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS));
 }
@@ -924,7 +924,7 @@ static int nsim_dev_traps_init(struct devlink *devlink)
 	INIT_DELAYED_WORK(&nsim_dev->trap_data->trap_report_dw,
 			  nsim_dev_trap_report_work);
-	queue_delayed_work(system_unbound_wq,
+	queue_delayed_work(system_dfl_wq,
 			   &nsim_dev->trap_data->trap_report_dw,
 			   msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS));
 
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 864f3bbc3a4c..212cde35affa 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -157,9 +157,9 @@ static void linkwatch_schedule_work(int urgent)
 	 * override the existing timer.
 	 */
 	if (test_bit(LW_URGENT, &linkwatch_flags))
-		mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);
+		mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
 	else
-		queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);
+		queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
 }
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 01e2b9452c75..684ab03137b6 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -592,7 +592,7 @@ static DECLARE_WORK(unix_gc_work, __unix_gc);
 void unix_gc(void)
 {
 	WRITE_ONCE(gc_in_progress, true);
-	queue_work(system_unbound_wq, &unix_gc_work);
+	queue_work(system_dfl_wq, &unix_gc_work);
 }
 
 #define UNIX_INFLIGHT_TRIGGER_GC 16000
--
cgit v1.2.3


From 5fd8bb982e10f29e856ef71072609af5ce55d281 Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Thu, 18 Sep 2025 16:24:26 +0200
Subject: net: replace use of system_wq with system_percpu_wq

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

system_unbound_wq should be the default workqueue so as not to enforce
locality constraints for random work whenever it's not required.
Adding system_dfl_wq to encourage its use when unbound work should be used. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20250918142427.309519-3-marco.crivellari@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/efx_channels.c | 2 +- drivers/net/ethernet/sfc/siena/efx_channels.c | 2 +- drivers/net/phy/sfp.c | 12 ++++++------ net/bridge/br_cfm.c | 6 +++--- net/bridge/br_mrp.c | 8 ++++---- net/ceph/mon_client.c | 2 +- net/core/skmsg.c | 2 +- net/devlink/core.c | 2 +- net/ipv4/inet_fragment.c | 2 +- net/netfilter/nf_conntrack_ecache.c | 2 +- net/openvswitch/dp_notify.c | 2 +- net/rfkill/input.c | 2 +- net/smc/smc_core.c | 2 +- net/vmw_vsock/af_vsock.c | 2 +- 14 files changed, 24 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c index 0f66324ed351..ed3a96ebc7f3 100644 --- a/drivers/net/ethernet/sfc/efx_channels.c +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -1281,7 +1281,7 @@ static int efx_poll(struct napi_struct *napi, int budget) time = jiffies - channel->rfs_last_expiry; /* Would our quota be >= 20? */ if (channel->rfs_filter_count * time >= 600 * HZ) - mod_delayed_work(system_wq, &channel->filter_work, 0); + mod_delayed_work(system_percpu_wq, &channel->filter_work, 0); #endif /* There is no race here; although napi_disable() will diff --git a/drivers/net/ethernet/sfc/siena/efx_channels.c b/drivers/net/ethernet/sfc/siena/efx_channels.c index 703419866d18..fc075ab6b7b5 100644 --- a/drivers/net/ethernet/sfc/siena/efx_channels.c +++ b/drivers/net/ethernet/sfc/siena/efx_channels.c @@ -1300,7 +1300,7 @@ static int efx_poll(struct napi_struct *napi, int budget) time = jiffies - channel->rfs_last_expiry; /* Would our quota be >= 20? */ if (channel->rfs_filter_count * time >= 600 * HZ) - mod_delayed_work(system_wq, &channel->filter_work, 0); + mod_delayed_work(system_percpu_wq, &channel->filter_work, 0); #endif /* There is no race here; although napi_disable() will diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index d49f91ac2e50..dfea675281fd 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -911,7 +911,7 @@ static void sfp_soft_start_poll(struct sfp *sfp) if (sfp->state_soft_mask & (SFP_F_LOS | SFP_F_TX_FAULT) && !sfp->need_poll) - mod_delayed_work(system_wq, &sfp->poll, poll_jiffies); + mod_delayed_work(system_percpu_wq, &sfp->poll, poll_jiffies); mutex_unlock(&sfp->st_mutex); } @@ -1682,7 +1682,7 @@ static void sfp_hwmon_probe(struct work_struct *work) err = sfp_read(sfp, true, 0, &sfp->diag, sizeof(sfp->diag)); if (err < 0) { if (sfp->hwmon_tries--) { - mod_delayed_work(system_wq, &sfp->hwmon_probe, + mod_delayed_work(system_percpu_wq, &sfp->hwmon_probe, T_PROBE_RETRY_SLOW); } else { dev_warn(sfp->dev, "hwmon probe failed: %pe\n", @@ -1709,7 +1709,7 @@ static void sfp_hwmon_probe(struct work_struct *work) static int sfp_hwmon_insert(struct sfp *sfp) { if (sfp->have_a2 && sfp->id.ext.diagmon & SFP_DIAGMON_DDM) { - mod_delayed_work(system_wq, &sfp->hwmon_probe, 1); + mod_delayed_work(system_percpu_wq, &sfp->hwmon_probe, 1); sfp->hwmon_tries = R_PROBE_RETRY_SLOW; } @@ -2563,7 +2563,7 @@ static void sfp_sm_module(struct sfp *sfp, unsigned int event) /* Force a poll to re-read the hardware signal state after * sfp_sm_mod_probe() changed state_hw_mask. 
*/ - mod_delayed_work(system_wq, &sfp->poll, 1); + mod_delayed_work(system_percpu_wq, &sfp->poll, 1); err = sfp_hwmon_insert(sfp); if (err) @@ -3008,7 +3008,7 @@ static void sfp_poll(struct work_struct *work) // it's unimportant if we race while reading this. if (sfp->state_soft_mask & (SFP_F_LOS | SFP_F_TX_FAULT) || sfp->need_poll) - mod_delayed_work(system_wq, &sfp->poll, poll_jiffies); + mod_delayed_work(system_percpu_wq, &sfp->poll, poll_jiffies); } static struct sfp *sfp_alloc(struct device *dev) @@ -3178,7 +3178,7 @@ static int sfp_probe(struct platform_device *pdev) } if (sfp->need_poll) - mod_delayed_work(system_wq, &sfp->poll, poll_jiffies); + mod_delayed_work(system_percpu_wq, &sfp->poll, poll_jiffies); /* We could have an issue in cases no Tx disable pin is available or * wired as modules using a laser as their light source will continue to diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c index a3c755d0a09d..c2c1c7d44c61 100644 --- a/net/bridge/br_cfm.c +++ b/net/bridge/br_cfm.c @@ -134,7 +134,7 @@ static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) * of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. */ - queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork, + queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork, usecs_to_jiffies(interval_us / 4)); } @@ -285,7 +285,7 @@ static void ccm_tx_work_expired(struct work_struct *work) ccm_frame_tx(skb); interval_us = interval_to_us(mep->cc_config.exp_interval); - queue_delayed_work(system_wq, &mep->ccm_tx_dwork, + queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, usecs_to_jiffies(interval_us)); } @@ -809,7 +809,7 @@ int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, * to send first frame immediately */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); - queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0); + queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0); save: mep->cc_ccm_tx_info = *tx_info; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index fd2de35ffb3c..3c36fa24bc05 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -341,7 +341,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) out: rcu_read_unlock(); - queue_delayed_work(system_wq, &mrp->test_work, + queue_delayed_work(system_percpu_wq, &mrp->test_work, usecs_to_jiffies(mrp->test_interval)); } @@ -418,7 +418,7 @@ static void br_mrp_in_test_work_expired(struct work_struct *work) out: rcu_read_unlock(); - queue_delayed_work(system_wq, &mrp->in_test_work, + queue_delayed_work(system_percpu_wq, &mrp->in_test_work, usecs_to_jiffies(mrp->in_test_interval)); } @@ -725,7 +725,7 @@ int br_mrp_start_test(struct net_bridge *br, mrp->test_max_miss = test->max_miss; mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; - queue_delayed_work(system_wq, &mrp->test_work, + queue_delayed_work(system_percpu_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); return 0; @@ -865,7 +865,7 @@ int br_mrp_start_in_test(struct net_bridge *br, mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period); mrp->in_test_max_miss = in_test->max_miss; mrp->in_test_count_miss = 0; - queue_delayed_work(system_wq, &mrp->in_test_work, + queue_delayed_work(system_percpu_wq, &mrp->in_test_work, usecs_to_jiffies(in_test->interval)); return 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ab66b599ac47..c227ececa925 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -314,7 +314,7 @@ static void 
__schedule_delayed(struct ceph_mon_client *monc) delay = CEPH_MONC_PING_INTERVAL; dout("__schedule_delayed after %lu\n", delay); - mod_delayed_work(system_wq, &monc->delayed_work, + mod_delayed_work(system_percpu_wq, &monc->delayed_work, round_jiffies_relative(delay)); } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 83c78379932e..2ac7731e1e0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -876,7 +876,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); - queue_rcu_work(system_wq, &psock->rwork); + queue_rcu_work(system_percpu_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); diff --git a/net/devlink/core.c b/net/devlink/core.c index 7203c39532fc..58093f49c090 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -320,7 +320,7 @@ static void devlink_release(struct work_struct *work) void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) - queue_rcu_work(system_wq, &devlink->rwork); + queue_rcu_work(system_percpu_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 470ab17ceb51..025895eb6ec5 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -183,7 +183,7 @@ static void fqdir_work_fn(struct work_struct *work) rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) - queue_delayed_work(system_wq, &fqdir_free_work, HZ); + queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index af68c64acaab..81baf2082604 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -301,7 +301,7 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) - mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 7af0cde8b293..a2af90ee99af 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -75,7 +75,7 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); - queue_work(system_wq, &ovs_net->dp_notify_work); + queue_work(system_percpu_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; diff --git a/net/rfkill/input.c b/net/rfkill/input.c index 598d0a61bda7..53d286b10843 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -159,7 +159,7 @@ static void rfkill_schedule_global_op(enum rfkill_sched_op op) rfkill_op_pending = true; if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { /* bypass the limiter for EPO */ - mod_delayed_work(system_wq, &rfkill_op_work, 0); + mod_delayed_work(system_percpu_wq, &rfkill_op_work, 0); rfkill_last_scheduled = jiffies; } else rfkill_schedule_ratelimited(); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2a559a98541c..e216d237865b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -85,7 +85,7 @@ 
static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { - mod_delayed_work(system_wq, &lgr->free_work, + mod_delayed_work(system_percpu_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 0538948d5fd9..4c2db6cca557 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1649,7 +1649,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ - if (mod_delayed_work(system_wq, &vsk->connect_work, + if (mod_delayed_work(system_percpu_wq, &vsk->connect_work, timeout)) sock_put(sk); -- cgit v1.2.3 From 27ce71e1ce81875df72f7698ba27988392bef602 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 18 Sep 2025 16:24:27 +0200 Subject: net: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and to queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This change adds a new WQ_PERCPU flag in the network subsystem to explicitly request per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly.
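A minimal sketch of the mechanical rule, assuming a hypothetical driver queue (names not from the diff): callers that relied on the implicit per-CPU default now state it explicitly, which later lets the default flip to unbound without changing their behavior:

	static struct workqueue_struct *example_wq;

	static int __init example_init(void)
	{
		/* before: example_wq = alloc_workqueue("example", 0, 0); */
		example_wq = alloc_workqueue("example", WQ_PERCPU, 0);
		if (!example_wq)
			return -ENOMEM;
		return 0;
	}
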
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://patch.msgid.link/20250918142427.309519-4-marco.crivellari@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/can/spi/hi311x.c | 3 ++- drivers/net/can/spi/mcp251x.c | 3 ++- drivers/net/ethernet/cavium/liquidio/lio_core.c | 2 +- drivers/net/ethernet/cavium/liquidio/lio_main.c | 8 +++++--- drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 3 ++- drivers/net/ethernet/cavium/liquidio/request_manager.c | 2 +- drivers/net/ethernet/cavium/liquidio/response_manager.c | 3 ++- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 +- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 3 ++- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 2 +- drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 2 +- drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 +- drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c | 3 ++- drivers/net/ethernet/marvell/prestera/prestera_main.c | 2 +- drivers/net/ethernet/marvell/prestera/prestera_pci.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/core.c | 4 ++-- drivers/net/ethernet/netronome/nfp/nfp_main.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_main.c | 3 ++- drivers/net/ethernet/wiznet/w5100.c | 2 +- drivers/net/fjes/fjes_main.c | 5 +++-- drivers/net/wireguard/device.c | 6 ++++-- drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c | 3 ++- drivers/net/wwan/wwan_hwsim.c | 2 +- net/ceph/messenger.c | 3 ++- net/core/sock_diag.c | 2 +- net/rds/ib_rdma.c | 3 ++- net/rxrpc/rxperf.c | 2 +- net/smc/af_smc.c | 6 +++--- net/smc/smc_core.c | 2 +- net/tls/tls_device.c | 2 +- net/vmw_vsock/virtio_transport.c | 2 +- net/vmw_vsock/vsock_loopback.c | 2 +- 35 files changed, 57 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index 09ae218315d7..96f23311b4ee 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -770,7 +770,8 @@ static int hi3110_open(struct net_device *net) goto out_close; } - priv->wq = alloc_workqueue("hi3110_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, + priv->wq = alloc_workqueue("hi3110_wq", + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!priv->wq) { ret = -ENOMEM; diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 313e1d241f01..b797e08499d7 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -1378,7 +1378,8 @@ static int mcp251x_can_probe(struct spi_device *spi) if (ret) goto out_clk; - priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, + priv->wq = alloc_workqueue("mcp251x_wq", + WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!priv->wq) { ret = -ENOMEM; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c index 674c54831875..215dac201b4a 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_core.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c @@ -472,7 +472,7 @@ int setup_rx_oom_poll_fn(struct net_device *netdev) q_no = lio->linfo.rxpciq[q].s.q_no; wq = &lio->rxq_status_wq[q_no]; wq->wq = alloc_workqueue("rxq-oom-status", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!wq->wq) { dev_err(&oct->pci_dev->dev, "unable to create cavium rxq oom status wq\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c index 1d79f6eaa41f..8e2fcec26ea1 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c @@ -526,7 +526,8 @@ static inline int setup_link_status_change_wq(struct net_device *netdev) struct octeon_device *oct = lio->oct_dev; lio->link_status_wq.wq = alloc_workqueue("link-status", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!lio->link_status_wq.wq) { dev_err(&oct->pci_dev->dev, "unable to create cavium link status wq\n"); return -1; @@ -659,7 +660,8 @@ static inline int setup_sync_octeon_time_wq(struct net_device *netdev) struct octeon_device *oct = lio->oct_dev; lio->sync_octeon_time_wq.wq = - alloc_workqueue("update-octeon-time", WQ_MEM_RECLAIM, 0); + alloc_workqueue("update-octeon-time", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!lio->sync_octeon_time_wq.wq) { dev_err(&oct->pci_dev->dev, "Unable to create wq to update octeon time\n"); return -1; @@ -1734,7 +1736,7 @@ static inline int setup_tx_poll_fn(struct net_device *netdev) struct octeon_device *oct = lio->oct_dev; lio->txq_status_wq.wq = alloc_workqueue("txq-status", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!lio->txq_status_wq.wq) { dev_err(&oct->pci_dev->dev, "unable to create cavium txq status wq\n"); return -1; diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c index 62c2eadc33e3..3230dff5ba05 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c @@ -304,7 +304,8 @@ static int setup_link_status_change_wq(struct net_device *netdev) struct octeon_device *oct = lio->oct_dev; lio->link_status_wq.wq = alloc_workqueue("link-status", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!lio->link_status_wq.wq) { dev_err(&oct->pci_dev->dev, "unable to create cavium link status wq\n"); return -1; diff --git a/drivers/net/ethernet/cavium/liquidio/request_manager.c b/drivers/net/ethernet/cavium/liquidio/request_manager.c index 12105ffb5dac..d7cfb20eea00 100644 --- a/drivers/net/ethernet/cavium/liquidio/request_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/request_manager.c @@ -132,7 +132,7 @@ int octeon_init_instr_queue(struct octeon_device *oct, oct->fn_list.setup_iq_regs(oct, iq_no); oct->check_db_wq[iq_no].wq = alloc_workqueue("check_iq_db", - WQ_MEM_RECLAIM, + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!oct->check_db_wq[iq_no].wq) { vfree(iq->request_list); diff --git a/drivers/net/ethernet/cavium/liquidio/response_manager.c b/drivers/net/ethernet/cavium/liquidio/response_manager.c index 861050966e18..de1a8335b545 100644 --- a/drivers/net/ethernet/cavium/liquidio/response_manager.c +++ b/drivers/net/ethernet/cavium/liquidio/response_manager.c @@ -39,7 +39,8 @@ int octeon_setup_response_list(struct octeon_device *oct) } spin_lock_init(&oct->cmd_resp_wqlock); - oct->dma_comp_wq.wq = alloc_workqueue("dma-comp", WQ_MEM_RECLAIM, 0); + oct->dma_comp_wq.wq = alloc_workqueue("dma-comp", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!oct->dma_comp_wq.wq) { dev_err(&oct->pci_dev->dev, "failed to create wq thread\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 0f4efd505332..c96d1d6ba8fe 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4884,7 +4884,7 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev) 
priv->tx_tstamp_type = HWTSTAMP_TX_OFF; priv->rx_tstamp = false; - priv->dpaa2_ptp_wq = alloc_workqueue("dpaa2_ptp_wq", 0, 0); + priv->dpaa2_ptp_wq = alloc_workqueue("dpaa2_ptp_wq", WQ_PERCPU, 0); if (!priv->dpaa2_ptp_wq) { err = -ENOMEM; goto err_wq_alloc; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f5457ae0b64f..9d34d28ff168 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -12912,7 +12912,8 @@ static int __init hclge_init(void) { pr_debug("%s is initializing\n", HCLGE_NAME); - hclge_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, HCLGE_NAME); + hclge_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, + HCLGE_NAME); if (!hclge_wq) { pr_err("%s: failed to create workqueue\n", HCLGE_NAME); return -ENOMEM; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 142f07ca8bc0..b8c15b837fda 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -37,7 +37,7 @@ static int __init fm10k_init_module(void) pr_info("%s\n", fm10k_copyright); /* create driver workqueue */ - fm10k_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, + fm10k_workqueue = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_PERCPU, 0, fm10k_driver_name); if (!fm10k_workqueue) return -ENOMEM; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index b14019d44b58..02fccdbbc288 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -16617,7 +16617,7 @@ static int __init i40e_init_module(void) * since we need to be able to guarantee forward progress even under * memory pressure. 
*/ - i40e_wq = alloc_workqueue("%s", 0, 0, i40e_driver_name); + i40e_wq = alloc_workqueue("%s", WQ_PERCPU, 0, i40e_driver_name); if (!i40e_wq) { pr_err("%s: Failed to create workqueue\n", i40e_driver_name); return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 0c46ba8a5adc..ca1343f43379 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -2005,7 +2005,7 @@ static int cgx_probe(struct pci_dev *pdev, const struct pci_device_id *id) /* init wq for processing linkup requests */ INIT_WORK(&cgx->cgx_cmd_work, cgx_lmac_linkup_work); - cgx->cgx_cmd_workq = alloc_workqueue("cgx_cmd_workq", 0, 0); + cgx->cgx_cmd_workq = alloc_workqueue("cgx_cmd_workq", WQ_PERCPU, 0); if (!cgx->cgx_cmd_workq) { dev_err(dev, "alloc workqueue failed for cgx cmd"); err = -ENOMEM; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c index d7030dfa5dad..a80c8e7c94f2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c @@ -913,7 +913,7 @@ int rvu_mcs_init(struct rvu *rvu) /* Initialize the wq for handling mcs interrupts */ INIT_LIST_HEAD(&rvu->mcs_intrq_head); INIT_WORK(&rvu->mcs_intr_work, mcs_intr_handler_task); - rvu->mcs_intr_wq = alloc_workqueue("mcs_intr_wq", 0, 0); + rvu->mcs_intr_wq = alloc_workqueue("mcs_intr_wq", WQ_PERCPU, 0); if (!rvu->mcs_intr_wq) { dev_err(rvu->dev, "mcs alloc workqueue failed\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index 3303c475414a..3abd750a4bd7 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -315,7 +315,7 @@ static int cgx_lmac_event_handler_init(struct rvu *rvu) spin_lock_init(&rvu->cgx_evq_lock); INIT_LIST_HEAD(&rvu->cgx_evq_head); INIT_WORK(&rvu->cgx_evh_work, cgx_evhandler_task); - rvu->cgx_evh_wq = alloc_workqueue("rvu_evh_wq", 0, 0); + rvu->cgx_evh_wq = alloc_workqueue("rvu_evh_wq", WQ_PERCPU, 0); if (!rvu->cgx_evh_wq) { dev_err(rvu->dev, "alloc workqueue failed"); return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c index 03099bc570bd..4415d0ce9aef 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_rep.c @@ -376,7 +376,7 @@ int rvu_rep_install_mcam_rules(struct rvu *rvu) spin_lock_init(&rvu->rep_evtq_lock); INIT_LIST_HEAD(&rvu->rep_evtq_head); INIT_WORK(&rvu->rep_evt_work, rvu_rep_wq_handler); - rvu->rep_evt_wq = alloc_workqueue("rep_evt_wq", 0, 0); + rvu->rep_evt_wq = alloc_workqueue("rep_evt_wq", WQ_PERCPU, 0); if (!rvu->rep_evt_wq) { dev_err(rvu->dev, "REP workqueue allocation failed\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c index c691f0722154..77543d472345 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_ipsec.c @@ -798,7 +798,8 @@ int cn10k_ipsec_init(struct net_device *netdev) pf->ipsec.sa_size = sa_size; INIT_WORK(&pf->ipsec.sa_work, cn10k_ipsec_sa_wq_handler); - pf->ipsec.sa_workq = alloc_workqueue("cn10k_ipsec_sa_workq", 0, 0); + pf->ipsec.sa_workq = 
alloc_workqueue("cn10k_ipsec_sa_workq", + WQ_PERCPU, 0); if (!pf->ipsec.sa_workq) { netdev_err(pf->netdev, "SA alloc workqueue failed\n"); return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 71ffb55d1fc4..65e7ef033bde 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -1500,7 +1500,7 @@ EXPORT_SYMBOL(prestera_device_unregister); static int __init prestera_module_init(void) { - prestera_wq = alloc_workqueue("prestera", 0, 0); + prestera_wq = alloc_workqueue("prestera", WQ_PERCPU, 0); if (!prestera_wq) return -ENOMEM; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c index c45d108b2f6d..3e13322470da 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c @@ -898,7 +898,7 @@ static int prestera_pci_probe(struct pci_dev *pdev, dev_info(fw->dev.dev, "Prestera FW is ready\n"); - fw->wq = alloc_workqueue("prestera_fw_wq", WQ_HIGHPRI, 1); + fw->wq = alloc_workqueue("prestera_fw_wq", WQ_HIGHPRI | WQ_PERCPU, 1); if (!fw->wq) { err = -ENOMEM; goto err_wq_alloc; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 980f3223f124..83c7cf3bbea3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -886,7 +886,7 @@ static int mlxsw_emad_init(struct mlxsw_core *mlxsw_core) if (!(mlxsw_core->bus->features & MLXSW_BUS_F_TXRX)) return 0; - emad_wq = alloc_workqueue("mlxsw_core_emad", 0, 0); + emad_wq = alloc_workqueue("mlxsw_core_emad", WQ_PERCPU, 0); if (!emad_wq) return -ENOMEM; mlxsw_core->emad_wq = emad_wq; @@ -3381,7 +3381,7 @@ static int __init mlxsw_core_module_init(void) if (err) return err; - mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, 0, 0); + mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_PERCPU, 0); if (!mlxsw_wq) { err = -ENOMEM; goto err_alloc_workqueue; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index 71301dbd8fb5..48390b2fd44d 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -797,7 +797,7 @@ static int nfp_pci_probe(struct pci_dev *pdev, pf->pdev = pdev; pf->dev_info = dev_info; - pf->wq = alloc_workqueue("nfp-%s", 0, 2, pci_name(pdev)); + pf->wq = alloc_workqueue("nfp-%s", WQ_PERCPU, 2, pci_name(pdev)); if (!pf->wq) { err = -ENOMEM; goto err_pci_priv_unset; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 886061d7351a..d4685ad4b169 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1214,7 +1214,8 @@ static int qed_slowpath_wq_start(struct qed_dev *cdev) hwfn = &cdev->hwfns[i]; hwfn->slowpath_wq = alloc_workqueue("slowpath-%02x:%02x.%02x", - 0, 0, cdev->pdev->bus->number, + WQ_PERCPU, 0, + cdev->pdev->bus->number, PCI_SLOT(cdev->pdev->devfn), hwfn->abs_pf_id); diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index b77f096eaf99..c5424d882135 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -1142,7 +1142,7 @@ int w5100_probe(struct device *dev, const struct w5100_ops *ops, if (err < 0) goto err_register; - priv->xfer_wq = alloc_workqueue("%s", 
WQ_MEM_RECLAIM, 0, + priv->xfer_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_PERCPU, 0, netdev_name(ndev)); if (!priv->xfer_wq) { err = -ENOMEM; diff --git a/drivers/net/fjes/fjes_main.c b/drivers/net/fjes/fjes_main.c index 4a4ed2ccf72f..b63965d9a1ba 100644 --- a/drivers/net/fjes/fjes_main.c +++ b/drivers/net/fjes/fjes_main.c @@ -1364,14 +1364,15 @@ static int fjes_probe(struct platform_device *plat_dev) adapter->force_reset = false; adapter->open_guard = false; - adapter->txrx_wq = alloc_workqueue(DRV_NAME "/txrx", WQ_MEM_RECLAIM, 0); + adapter->txrx_wq = alloc_workqueue(DRV_NAME "/txrx", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (unlikely(!adapter->txrx_wq)) { err = -ENOMEM; goto err_free_netdev; } adapter->control_wq = alloc_workqueue(DRV_NAME "/control", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (unlikely(!adapter->control_wq)) { err = -ENOMEM; goto err_free_txrx_wq; diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 813bd10d3dc7..46a71ec36af8 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -333,7 +333,8 @@ static int wg_newlink(struct net_device *dev, goto err_free_peer_hashtable; wg->handshake_receive_wq = alloc_workqueue("wg-kex-%s", - WQ_CPU_INTENSIVE | WQ_FREEZABLE, 0, dev->name); + WQ_CPU_INTENSIVE | WQ_FREEZABLE | WQ_PERCPU, 0, + dev->name); if (!wg->handshake_receive_wq) goto err_free_index_hashtable; @@ -343,7 +344,8 @@ static int wg_newlink(struct net_device *dev, goto err_destroy_handshake_receive; wg->packet_crypt_wq = alloc_workqueue("wg-crypt-%s", - WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 0, dev->name); + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_PERCPU, 0, + dev->name); if (!wg->packet_crypt_wq) goto err_destroy_handshake_send; diff --git a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c index 6a7a26085fc7..2310493203d3 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_dpmaif_rx.c @@ -1085,7 +1085,8 @@ static void t7xx_dpmaif_bat_release_work(struct work_struct *work) int t7xx_dpmaif_bat_rel_wq_alloc(struct dpmaif_ctrl *dpmaif_ctrl) { dpmaif_ctrl->bat_release_wq = alloc_workqueue("dpmaif_bat_release_work_queue", - WQ_MEM_RECLAIM, 1); + WQ_MEM_RECLAIM | WQ_PERCPU, + 1); if (!dpmaif_ctrl->bat_release_wq) return -ENOMEM; diff --git a/drivers/net/wwan/wwan_hwsim.c b/drivers/net/wwan/wwan_hwsim.c index b02befd1b6fb..733688cd4607 100644 --- a/drivers/net/wwan/wwan_hwsim.c +++ b/drivers/net/wwan/wwan_hwsim.c @@ -509,7 +509,7 @@ static int __init wwan_hwsim_init(void) if (wwan_hwsim_devsnum < 0 || wwan_hwsim_devsnum > 128) return -EINVAL; - wwan_wq = alloc_workqueue("wwan_wq", 0, 0); + wwan_wq = alloc_workqueue("wwan_wq", WQ_PERCPU, 0); if (!wwan_wq) return -ENOMEM; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9f6d860411cb..1fbec4853f00 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -252,7 +252,8 @@ int __init ceph_msgr_init(void) * The number of active work items is limited by the number of * connections, so leave @max_active at default. 
*/ - ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (ceph_msgr_wq) return 0; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b23594c767f2..026ce9bd9e5e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -348,7 +348,7 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { - broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0); BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index d1cfceeff133..6585164c7059 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -672,7 +672,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int rds_ib_mr_init(void) { - rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); + rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 0377301156b0..2ea71e3831f7 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -630,7 +630,7 @@ static int __init rxperf_init(void) pr_info("Server registering\n"); - rxperf_workqueue = alloc_workqueue("rxperf", 0, 0); + rxperf_workqueue = alloc_workqueue("rxperf", WQ_PERCPU, 0); if (!rxperf_workqueue) goto error_workqueue; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a7187e5873ec..9097e4f24d2b 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3535,15 +3535,15 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_PERCPU, 0); if (!smc_tcp_ls_wq) goto out_pnet; - smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + smc_hs_wq = alloc_workqueue("smc_hs_wq", WQ_PERCPU, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; - smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + smc_close_wq = alloc_workqueue("smc_close_wq", WQ_PERCPU, 0); if (!smc_close_wq) goto out_alloc_hs_wq; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e216d237865b..a9e80f44307d 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -896,7 +896,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } - lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", WQ_PERCPU, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a82fdcf19969..a64ae15b1a60 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1412,7 +1412,7 @@ int __init tls_device_init(void) if (!dummy_page) return -ENOMEM; - destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); + destruct_wq = alloc_workqueue("ktls_device_destruct", WQ_PERCPU, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b6569b0ca2bb..8c867023a2e5 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -926,7 +926,7 @@ static int __init virtio_vsock_init(void) { int ret; - virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); + virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", WQ_PERCPU, 0); if (!virtio_vsock_workqueue) return -ENOMEM; diff --git 
a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 6e78927a598e..bc2ff918b315 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -139,7 +139,7 @@ static int __init vsock_loopback_init(void) struct vsock_loopback *vsock = &the_vsock_loopback; int ret; - vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); + vsock->workqueue = alloc_workqueue("vsock-loopback", WQ_PERCPU, 0); if (!vsock->workqueue) return -ENOMEM; -- cgit v1.2.3 From 17b14d235f58155a05cd9371e4559361ca3c67da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:49 +0000 Subject: net: move sk_uid and sk_protocol to sock_read_tx sk_uid and sk_protocol are read from inet6_csk_route_socket() for each TCP transmit. Also read from udpv6_sendmsg(), udp_sendmsg() and others. Move them to sock_read_tx for better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 +++--- net/core/sock.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index ee95081b0c0b..66c2f396b57d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -492,6 +492,9 @@ struct sock { long sk_sndtimeo; u32 sk_priority; u32 sk_mark; + kuid_t sk_uid; + u16 sk_protocol; + u16 sk_type; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT @@ -517,15 +520,12 @@ struct sock { sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; - u16 sk_type; - u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - kuid_t sk_uid; unsigned long sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; diff --git a/net/core/sock.c b/net/core/sock.c index 21742da19e45..ad79efde4476 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4471,6 +4471,8 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); -- cgit v1.2.3 From 9303c3ced111803dcd1aa36a778f290977935ca5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:50 +0000 Subject: net: move sk->sk_err_soft and sk->sk_sndbuf sk->sk_sndbuf is read-mostly in tx path, so move it from sock_write_tx group to more appropriate sock_read_tx. sk->sk_err_soft was not identified previously, but is used from tcp_ack(). Move it to sock_write_tx group for better cache locality. Also change tcp_ack() to clear sk->sk_err_soft only if needed. 
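Two patterns are at work in this and the previous patch; a minimal sketch, assuming hypothetical field names (the grouping macros are the ones struct sock already uses):

	struct example_sock {
		__cacheline_group_begin(example_read_tx);
		int	read_mostly_on_tx;	/* e.g. sk_sndbuf after this change */
		__cacheline_group_end(example_read_tx);

		__cacheline_group_begin(example_write_tx);
		int	dirtied_on_tx;		/* e.g. sk_err_soft after this change */
		__cacheline_group_end(example_write_tx);
	};

	static void example_clear_soft_error(struct sock *sk)
	{
		/* test before store, so a clean cache line is not dirtied
		 * on every ACK; this is what tcp_ack() now does
		 */
		if (READ_ONCE(sk->sk_err_soft))
			WRITE_ONCE(sk->sk_err_soft, 0);
	}
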
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- net/core/sock.c | 3 ++- net/ipv4/tcp_input.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 66c2f396b57d..b4fefeea0213 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -467,7 +467,7 @@ struct sock { __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; - int sk_sndbuf; + int sk_err_soft; int sk_wmem_queued; refcount_t sk_wmem_alloc; @@ -507,6 +507,7 @@ struct sock { unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; + int sk_sndbuf; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); @@ -523,7 +524,6 @@ struct sock { unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; unsigned long sk_ino; diff --git a/net/core/sock.c b/net/core/sock.c index ad79efde4476..dc03d4b5909a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4452,7 +4452,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); @@ -4479,6 +4479,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9fdc6ce25eb1..f93d48d98d5d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4085,7 +4085,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* We passed data and got it acked, remove any soft error * log. Something worked... */ - WRITE_ONCE(sk->sk_err_soft, 0); + if (READ_ONCE(sk->sk_err_soft)) + WRITE_ONCE(sk->sk_err_soft, 0); WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) -- cgit v1.2.3 From e1b022c2bdf1f2a631340b1b2ef265090534f65a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:51 +0000 Subject: tcp: remove CACHELINE_ASSERT_GROUP_SIZE() uses Maintaining the CACHELINE_ASSERT_GROUP_SIZE() uses for struct tcp_sock has been painful. This had little benefit, so remove them. 
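The removed assertions were compile-time size guards, conceptually along the lines of the sketch below (macro internals approximated from the grouping helpers, not quoted from this patch); because the byte count had to be retuned whenever a field entered or left a group, they broke on nearly every layout change while catching little:

	/* roughly what CACHELINE_ASSERT_GROUP_SIZE(type, group, size) enforced: */
	BUILD_BUG_ON(offsetof(struct tcp_sock, __cacheline_group_end__tcp_sock_read_tx) -
		     offsetof(struct tcp_sock, __cacheline_group_begin__tcp_sock_read_tx) >
		     32 /* hand-maintained byte budget */);

The per-member assertions (CACHELINE_ASSERT_GROUP_MEMBER) are kept, since they document intent without pinning an exact size.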
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9b327b6807fc..5932dba3bd71 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5101,7 +5101,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5112,7 +5111,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); @@ -5129,9 +5127,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); #if IS_ENABLED(CONFIG_TLS_DEVICE) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77); -#else - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); #endif /* TX read-write hotpath cache lines */ @@ -5151,7 +5146,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); @@ -5172,11 +5166,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); - /* 32bit arches with 8byte alignment on u64 fields might need padding - * before tcp_clock_cache. 
- */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); - /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); @@ -5193,7 +5182,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) -- cgit v1.2.3 From 1b44d700023e77dd92821e7811db825e75a1a394 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:52 +0000 Subject: tcp: move tcp->rcv_tstamp to tcp_sock_write_txrx group tcp_ack() writes this field, it belongs to tcp_sock_write_txrx. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index d4dc01800945..429df29fba8b 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -26,7 +26,7 @@ u64 bytes_acked read_w u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update -u32 rcv_tstamp read_mostly tcp_ack +u32 rcv_tstamp read_write read_write tcp_ack void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3ca5ed02de6d..1e6c2ded22c9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,7 +238,6 @@ struct tcp_sock { /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ @@ -246,13 +245,13 @@ struct tcp_sock { u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 snd_ssthresh; /* Slow start size threshold */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); @@ -319,6 +318,7 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). 
*/ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5932dba3bd71..721287ca3328 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5114,7 +5114,6 @@ static void __init tcp_struct_check(void) /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); @@ -5164,6 +5163,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); /* RX read-write hotpath cache lines */ -- cgit v1.2.3 From a105ea47a4e855d24ebf65f1c5fb907162e7b8cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:54 +0000 Subject: tcp: move tcp_clean_acked to tcp_sock_read_tx group tp->tcp_clean_acked is fetched in tx path when snd_una is updated. This field thus belongs to tcp_sock_read_tx group. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 6 +++--- net/ipv4/tcp.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index c2138619b995..26f32dbcf6ec 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -27,7 +27,7 @@ u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update u32 rcv_tstamp read_write read_write tcp_ack -void * tcp_clean_acked read_mostly tcp_ack +void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time u32 compressed_ack_rcv_nxt diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c1d7fce251d7..3f282130c863 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,6 +215,9 @@ struct tcp_sock { u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *retransmit_skb_hint; +#if defined(CONFIG_TLS_DEVICE) + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); +#endif __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ @@ -250,9 +253,6 @@ struct tcp_sock { struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. 
*/ struct rb_root out_of_order_queue; -#if defined(CONFIG_TLS_DEVICE) - void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); -#endif __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 721287ca3328..7949d16506a4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5101,6 +5101,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); +#endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5124,9 +5127,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); -#if IS_ENABLED(CONFIG_TLS_DEVICE) - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); -#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); -- cgit v1.2.3 From 649091ef597bb7de34dd8ceea39bbc4252970558 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:56 +0000 Subject: tcp: reclaim 8 bytes in struct request_sock_queue synflood_warned had to be u32 for xchg(), but ensuring atomicity is not really needed. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 2 +- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6a5ec1418e85..cd4d4cf71d0d 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -185,8 +185,8 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u8 synflood_warned; - u32 synflood_warned; atomic_t qlen; atomic_t young; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f93d48d98d5d..79d5252ed6cc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7282,8 +7282,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) { + if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { + WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), -- cgit v1.2.3 From 17f34ab55a8518ecbd5dcacec48e6ee903f7c1d0 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Mon, 22 Sep 2025 22:19:08 +0000 Subject: wifi: cfg80211: fix width unit in cfg80211_radio_chandef_valid() The original code used nl80211_chan_width_to_mhz(), which returns the width in MHz. However, the expected unit is KHz. 
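In short, both operands of the range check must be in kHz; restating the one-line fix in context:

	u32 freq  = ieee80211_chandef_to_khz(chandef);			/* kHz */
	u32 width = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef));	/* was MHz */

	if (!ieee80211_radio_freq_range_valid(radio, freq, width))
		return false;
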
Fixes: 510dba80ed66 ("wifi: cfg80211: add helper for checking if a chandef is valid on a radio") Signed-off-by: Ryder Lee Link: https://patch.msgid.link/df54294e6c4ed0f3ceff6e818b710478ddfc62c0.1758579480.git.Ryder%20Lee%20ryder.lee@mediatek.com/ Signed-off-by: Johannes Berg --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index f26440d18ad3..56724b33af04 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2965,7 +2965,7 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); - width = cfg80211_chandef_get_width(chandef); + width = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef)); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; -- cgit v1.2.3 From c67732d067860850b767c81736b49f88a946bffb Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:37:08 +0900 Subject: can: annotate mtu accesses with READ_ONCE() As hinted in commit 501a90c94510 ("inet: protect against too small mtu values."), net_device->mtu is vulnerable to race conditions if it is written and read without holding the RTNL. At the moment, all the writes are done while the interface is down, either in the devices' probe() function or in can_changelink(). So there are no such issues yet. But upcoming changes will allow the MTU to be modified while the CAN XL devices are up. In preparation for the introduction of CAN XL, annotate all the net_device->mtu accesses which are not yet guarded by the RTNL with a READ_ONCE(). Note that all the write accesses are already either guarded by the RTNL or are already annotated and thus need no changes. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-can-fix-mtu-v3-1-581bde113f52@kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 2 +- net/can/isotp.c | 2 +- net/can/raw.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index b2387a46794a..770173d8db42 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -221,7 +221,7 @@ int can_send(struct sk_buff *skb, int loop) } /* Make sure the CAN frame can pass the selected CAN netdevice.
*/ - if (unlikely(skb->len > skb->dev->mtu)) { + if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) { err = -EMSGSIZE; goto inval_skb; } diff --git a/net/can/isotp.c b/net/can/isotp.c index dee1412b3c9c..74ee1e52249b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1313,7 +1313,7 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) err = -ENODEV; goto out; } - if (dev->mtu < so->ll.mtu) { + if (READ_ONCE(dev->mtu) < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; diff --git a/net/can/raw.c b/net/can/raw.c index bf65d67b5df0..a53853f5e9af 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -961,7 +961,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ - txmtu = raw_check_txframe(ro, skb, dev->mtu); + txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu)); if (!txmtu) goto free_skb; -- cgit v1.2.3 From d57f4b874946e997be52f5ebb5e0e1dad368c16f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 17 Sep 2025 15:22:04 +0200 Subject: tcp: Update bind bucket state on port release Today, once an inet_bind_bucket enters a state where fastreuse >= 0 or fastreuseport >= 0 after a socket is explicitly bound to a port, it remains in that state until all sockets are removed and the bucket is destroyed. In this state, the bucket is skipped during ephemeral port selection in connect(). For applications using a reduced ephemeral port range (IP_LOCAL_PORT_RANGE socket option), this can cause faster port exhaustion since blocked buckets are excluded from reuse. The reason the bucket state isn't updated on port release is unclear. Possibly a performance trade-off to avoid scanning bucket owners, or just an oversight. Fix it by recalculating the bucket state when a socket releases a port. To limit overhead, each inet_bind2_bucket stores its own (fastreuse, fastreuseport) state. On port release, only the relevant port-addr bucket is scanned, and the overall state is derived from these. 
Signed-off-by: Jakub Sitnicki Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250917-update-bind-bucket-state-on-unhash-v5-1-57168b661b47@cloudflare.com Signed-off-by: Paolo Abeni --- include/net/inet_connection_sock.h | 5 +++-- include/net/inet_hashtables.h | 2 ++ include/net/inet_timewait_sock.h | 3 ++- include/net/sock.h | 4 ++++ net/ipv4/inet_connection_sock.c | 12 +++++++---- net/ipv4/inet_hashtables.c | 44 +++++++++++++++++++++++++++++++++++++- net/ipv4/inet_timewait_sock.c | 1 + 7 files changed, 63 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0737d8e178dd..b4b886647607 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -316,8 +316,9 @@ int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); /* update the fast reuse flag when adding a socket */ -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk); +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b787be651ce7..ac05a52d9e13 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -108,6 +108,8 @@ struct inet_bind2_bucket { struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; + signed char fastreuse; + signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 3a31c74c9e15..63a644ff30de 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -70,7 +70,8 @@ struct inet_timewait_sock { unsigned int tw_transparent : 1, tw_flowlabel : 20, tw_usec_ts : 1, - tw_pad : 2, /* 2 bits hole */ + tw_connect_bind : 1, + tw_pad : 1, /* 1 bit hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; diff --git a/include/net/sock.h b/include/net/sock.h index b4fefeea0213..8c5b64f41ab7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1494,6 +1494,10 @@ static inline int __sk_prot_rehash(struct sock *sk) #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 +/** + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time + */ +#define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 142ff8d86fc2..cdd1e12aac8c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -423,7 +423,7 @@ success: } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, - struct sock *sk) + const struct sock *sk) { if (tb->fastreuseport <= 0) return 0; @@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk) +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; @@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, tb->fastreuseport = 0; } } + + tb2->fastreuse = tb->fastreuse; + tb2->fastreuseport = tb->fastreuseport; } /* Obtain a reference to a local port for the 
given sock, @@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } success: - inet_csk_update_fastreuse(tb, sk); + inet_csk_update_fastreuse(sk, tb, tb2); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4eb933f56fe6..b7024e3d9ac3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk) sk->sk_daddr, sk->sk_dport); } +static bool sk_is_connect_bind(const struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk(sk)->tw_connect_bind; + else + return sk->sk_userlocks & SOCK_CONNECT_BIND; +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. @@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { + const struct inet_bind2_bucket *tb2; + if (hlist_empty(&tb->bhash2)) { hlist_del_rcu(&tb->node); kfree_rcu(tb, rcu); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { + if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, @@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif + tb2->fastreuse = 0; + tb2->fastreuseport = 0; INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); @@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { + const struct sock *sk; + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); + return; } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + sk_for_each_bound(sk, &tb->owners) { + if (!sk_is_connect_bind(sk)) + return; + } + tb->fastreuse = -1; + tb->fastreuseport = -1; } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, @@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk) tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; + sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { @@ -277,7 +312,7 @@ bhash2_find: } } if (update_fastreuse) - inet_csk_update_fastreuse(tb, child); + inet_csk_update_fastreuse(child, tb, tb2); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); @@ -950,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); + if (sk_is_connect_bind(sk)) { + tb2->fastreuse = -1; + tb2->fastreuseport = -1; + } } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -1120,6 +1159,8 @@ ok: head2, tb, sk); if (!tb2) goto error; + tb2->fastreuse = -1; + tb2->fastreuseport = -1; } /* Here we want to add a little bit of randomness to the next source @@ -1132,6 +1173,7 @@ ok: /* Head lock still 
held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); + sk->sk_userlocks |= SOCK_CONNECT_BIND; if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2ca2912f61f4..e1a86130f038 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -208,6 +208,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); + tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); -- cgit v1.2.3 From 884eee8e43f3072db4111178c98b9aa5c57bcf92 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:47 +0200 Subject: net/smc: Remove error handling of unregister_dmb() smcd_buf_free() calls smc_ism_unregister_dmb(lgr->smcd, buf_desc) and then unconditionally frees buf_desc. Remove the cleaning up of fields of buf_desc in smc_ism_unregister_dmb(), because it is not helpful. This removes the only usage of ISM_ERROR from the smc module. So move it to drivers/s390/net/ism.h. Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Reviewed-by: Dust Li Link: https://patch.msgid.link/20250918110500.1731261-2-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism.h | 1 + include/net/smc.h | 2 -- net/smc/smc_ism.c | 14 +++++--------- net/smc/smc_ism.h | 3 ++- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 047fa6101555..b5b03db52fce 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -10,6 +10,7 @@ #include #define UTIL_STR_LEN 16 +#define ISM_ERROR 0xFFFF /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. 
diff --git a/include/net/smc.h b/include/net/smc.h index db84e4e35080..a9c023dd1380 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -44,8 +44,6 @@ struct smcd_dmb { #define ISM_RESERVED_VLANID 0x1FFF -#define ISM_ERROR 0xFFFF - struct smcd_dev; struct smcd_gid { diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a58ffb7a0610..fca01b95b65a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -205,13 +205,13 @@ out: return rc; } -int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +void smc_ism_unregister_dmb(struct smcd_dev *smcd, + struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; - int rc = 0; if (!dmb_desc->dma_addr) - return rc; + return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; @@ -219,13 +219,9 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - rc = smcd->ops->unregister_dmb(smcd, &dmb); - if (!rc || rc == ISM_ERROR) { - dmb_desc->cpu_addr = NULL; - dmb_desc->dma_addr = 0; - } + smcd->ops->unregister_dmb(smcd, &dmb); - return rc; + return; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 6763133dd8d0..765aa8fae6fa 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -47,7 +47,8 @@ int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); -int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +void smc_ism_unregister_dmb(struct smcd_dev *dev, + struct smc_buf_desc *dmb_desc); bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc); -- cgit v1.2.3 From a4997e17d13767e67170f09bfa0b867862cad9d9 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:48 +0200 Subject: net/smc: Decouple sf and attached send_buf in smc_loopback Before this patch there was the following assumption in smc_loopback.c>smc_lo_move_data(): sf (signalling flag) == 0 : data is already in an attached target dmb sf == 1 : data is not yet in the target dmb This is true for the 2 callers in smc client smcd_cdc_msg_send() : sf=1 smcd_tx_rdma_writes() : sf=0 but should not be a general assumption. Add a bool to struct smc_buf_desc to indicate whether an SMC-D sndbuf_desc is an attached buffer. Don't call move_data() for attached send_buffers, because it is not necessary. Move the data in smc_lo_move_data() if len != 0 and signal when requested. 
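A userspace sketch of the decoupled contract (not the kernel function itself): the copy is driven solely by the data length, the receiver doorbell solely by sf. In the hunks below, smcd_tx_rdma_writes() now skips move_data() entirely for attached send buffers instead of relying on sf == 0.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for raising the receiver-side interrupt. */
static int signal_receiver(void)
{
	puts("receiver signalled");
	return 0;
}

/* Copy is driven by len alone, the signal by sf alone. */
static int lo_move_data(void *dst, const void *src, size_t len, bool sf)
{
	if (len)
		memcpy(dst, src, len);
	if (!sf)
		return 0;
	return signal_receiver();
}

int main(void)
{
	char dmb[16] = "";

	lo_move_data(dmb, "hello", 6, false);	/* data, no signal */
	lo_move_data(dmb, "", 0, true);		/* signal, no data */
	printf("%s\n", dmb);
	return 0;
}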
Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Reviewed-by: Dust Li Link: https://patch.msgid.link/20250918110500.1731261-3-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- net/smc/smc_core.h | 5 +++++ net/smc/smc_ism.c | 1 + net/smc/smc_loopback.c | 9 +++------ net/smc/smc_tx.c | 3 +++ 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 48a1b1dcb576..a5a78cbff341 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,6 +13,7 @@ #define _SMC_CORE_H #include +#include #include #include #include @@ -221,6 +222,10 @@ struct smc_buf_desc { /* virtually contiguous */ }; struct { /* SMC-D */ + /* SMC-D tx buffer */ + bool is_attached; + /* no need for explicit writes */ + /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index fca01b95b65a..503a9f93b392 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -274,6 +274,7 @@ int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; + dmb_desc->is_attached = true; } return rc; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 0eb00bbefd17..1853c26fbbbb 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -224,12 +224,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, struct smc_lo_dev *ldev = smcd->priv; struct smc_connection *conn; - if (!sf) - /* since sndbuf is merged with peer DMB, there is - * no need to copy data from sndbuf to peer DMB. - */ - return 0; - read_lock_bh(&ldev->dmb_ht_lock); hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { if (tmp_node->token == dmb_tok) { @@ -244,6 +238,9 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, memcpy((char *)rmb_node->cpu_addr + offset, data, size); read_unlock_bh(&ldev->dmb_ht_lock); + if (!sf) + return 0; + conn = smcd->conn[rmb_node->sba_idx]; if (!conn || conn->killed) return -EPIPE; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 214ac3cbcf9a..3144b4b1fe29 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -426,6 +426,9 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, int srcchunk, dstchunk; int rc; + if (conn->sndbuf_desc->is_attached) + return 0; + for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; -- cgit v1.2.3 From 35758b0032c056cdff3e8f5a70669cb3e2c8d0e4 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:49 +0200 Subject: dibs: Create drivers/dibs Create the file structure for a 'DIBS - Direct Internal Buffer Sharing' shim layer that will provide generic functionality and declarations for dibs device drivers and dibs clients. Following patches will add functionality. 
Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-4-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 7 +++++++ drivers/Makefile | 1 + drivers/dibs/Kconfig | 12 ++++++++++++ drivers/dibs/Makefile | 7 +++++++ drivers/dibs/dibs_main.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/dibs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ net/Kconfig | 1 + 7 files changed, 107 insertions(+) create mode 100644 drivers/dibs/Kconfig create mode 100644 drivers/dibs/Makefile create mode 100644 drivers/dibs/dibs_main.c create mode 100644 include/linux/dibs.h (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index a8a770714101..ecc55fae5f9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7132,6 +7132,13 @@ L: linux-gpio@vger.kernel.org S: Maintained F: drivers/gpio/gpio-gpio-mm.c +DIBS (DIRECT INTERNAL BUFFER SHARING) +M: Alexandra Winter +L: netdev@vger.kernel.org +S: Supported +F: drivers/dibs/ +F: include/linux/dibs.h + DIGITEQ AUTOMOTIVE MGB4 V4L2 DRIVER M: Martin Tuma L: linux-media@vger.kernel.org diff --git a/drivers/Makefile b/drivers/Makefile index b5749cf67044..a104163b1353 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -195,4 +195,5 @@ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_DPLL) += dpll/ +obj-$(CONFIG_DIBS) += dibs/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/dibs/Kconfig b/drivers/dibs/Kconfig new file mode 100644 index 000000000000..09c12f6838ad --- /dev/null +++ b/drivers/dibs/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +config DIBS + tristate "DIBS support" + default n + help + Direct Internal Buffer Sharing (DIBS) + A communication method that uses common physical (internal) memory + for synchronous direct access into a remote buffer. + + Select this option to provide the abstraction layer between + dibs devices and dibs clients like the SMC protocol. + The module name is dibs. diff --git a/drivers/dibs/Makefile b/drivers/dibs/Makefile new file mode 100644 index 000000000000..825dec431bfc --- /dev/null +++ b/drivers/dibs/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# DIBS class module +# + +dibs-y += dibs_main.o +obj-$(CONFIG_DIBS) += dibs.o diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c new file mode 100644 index 000000000000..68e189932fcf --- /dev/null +++ b/drivers/dibs/dibs_main.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIBS - Direct Internal Buffer Sharing + * + * Implementation of the DIBS class module + * + * Copyright IBM Corp. 2025 + */ +#define KMSG_COMPONENT "dibs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Direct Internal Buffer Sharing class"); +MODULE_LICENSE("GPL"); + +/* use an array rather a list for fast mapping: */ +static struct dibs_client *clients[MAX_DIBS_CLIENTS]; +static u8 max_client; + +static int __init dibs_init(void) +{ + memset(clients, 0, sizeof(clients)); + max_client = 0; + + return 0; +} + +static void __exit dibs_exit(void) +{ +} + +module_init(dibs_init); +module_exit(dibs_exit); diff --git a/include/linux/dibs.h b/include/linux/dibs.h new file mode 100644 index 000000000000..3f4175aaa732 --- /dev/null +++ b/include/linux/dibs.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Direct Internal Buffer Sharing + * + * Definitions for the DIBS module + * + * Copyright IBM Corp. 
2025 + */ +#ifndef _DIBS_H +#define _DIBS_H + +/* DIBS - Direct Internal Buffer Sharing - concept + * ----------------------------------------------- + * In the case of multiple system sharing the same hardware, dibs fabrics can + * provide dibs devices to these systems. The systems use dibs devices of the + * same fabric to communicate via dmbs (Direct Memory Buffers). Each dmb has + * exactly one owning local dibs device and one remote using dibs device, that + * is authorized to write into this dmb. This access control is provided by the + * dibs fabric. + * + * Because the access to the dmb is based on access to physical memory, it is + * lossless and synchronous. The remote devices can directly access any offset + * of the dmb. + * + * Dibs fabrics, dibs devices and dmbs are identified by tokens and ids. + * Dibs fabric id is unique within the same hardware (with the exception of the + * dibs loopback fabric), dmb token is unique within the same fabric, dibs + * device gids are guaranteed to be unique within the same fabric and + * statistically likely to be globally unique. The exchange of these tokens and + * ids between the systems is not part of the dibs concept. + * + * The dibs layer provides an abstraction between dibs device drivers and dibs + * clients. + */ + +#define MAX_DIBS_CLIENTS 8 + +struct dibs_client { + const char *name; +}; + +#endif /* _DIBS_H */ diff --git a/net/Kconfig b/net/Kconfig index 4b563aea4c23..1d3f757d4b07 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -88,6 +88,7 @@ source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "drivers/dibs/Kconfig" source "net/xdp/Kconfig" config NET_HANDSHAKE -- cgit v1.2.3 From d324a2ca3f8efd57f5839aa2690554a5cbb3586f Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:50 +0200 Subject: dibs: Register smc as dibs_client Formally register smc as dibs client. Functionality will be moved by follow-on patches from ism_client to dibs_client until eventually ism_client can be removed. As DIBS is only a shim layer without any dependencies, we can depend SMC on DIBS without adding indirect dependencies. A follow-on patch will remove dependency of SMC on ISM. 
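The registration pattern a dibs client follows can be sketched as a hypothetical module; "example" and the module boilerplate are illustrative, and only dibs_register_client()/dibs_unregister_client() from the hunks below are real API:

#include <linux/dibs.h>
#include <linux/module.h>

static struct dibs_client example_client = {
	.name = "example",
};

static int __init example_init(void)
{
	/* Claims a free slot in the dibs layer's client array. */
	return dibs_register_client(&example_client);
}

static void __exit example_exit(void)
{
	dibs_unregister_client(&example_client);
}

module_init(example_init);
module_exit(example_exit);
MODULE_DESCRIPTION("Illustrative dibs client");
MODULE_LICENSE("GPL");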
Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Link: https://patch.msgid.link/20250918110500.1731261-5-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/dibs.h | 23 +++++++++++++++++++++++ net/smc/Kconfig | 2 +- net/smc/smc_ism.c | 6 ++++++ 6 files changed, 67 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5e616bc988ac..7bc54f053a3b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -120,6 +120,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_DIBS=y CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 094599cdaf4d..4bf6f3311f7d 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -111,6 +111,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_DIBS=y CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index 68e189932fcf..a5d2be9c3246 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -20,6 +20,41 @@ MODULE_LICENSE("GPL"); /* use an array rather a list for fast mapping: */ static struct dibs_client *clients[MAX_DIBS_CLIENTS]; static u8 max_client; +static DEFINE_MUTEX(clients_lock); + +int dibs_register_client(struct dibs_client *client) +{ + int i, rc = -ENOSPC; + + mutex_lock(&clients_lock); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + if (!clients[i]) { + clients[i] = client; + client->id = i; + if (i == max_client) + max_client++; + rc = 0; + break; + } + } + mutex_unlock(&clients_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(dibs_register_client); + +int dibs_unregister_client(struct dibs_client *client) +{ + int rc = 0; + + mutex_lock(&clients_lock); + clients[client->id] = NULL; + if (client->id + 1 == max_client) + max_client--; + mutex_unlock(&clients_lock); + return rc; +} +EXPORT_SYMBOL_GPL(dibs_unregister_client); static int __init dibs_init(void) { diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 3f4175aaa732..7bedeaf52c1b 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -33,10 +33,33 @@ * clients. */ +/* DIBS client + * ----------- + */ #define MAX_DIBS_CLIENTS 8 struct dibs_client { + /* client name for logging and debugging purposes */ const char *name; + /* client index - provided and used by dibs layer */ + u8 id; }; +/* Functions to be called by dibs clients: + */ +/** + * dibs_register_client() - register a client with dibs layer + * @client: this client + * + * Return: zero on success. + */ +int dibs_register_client(struct dibs_client *client); +/** + * dibs_unregister_client() - unregister a client with dibs layer + * @client: this client + * + * Return: zero on success. 
+ */ +int dibs_unregister_client(struct dibs_client *client); + #endif /* _DIBS_H */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index ba5e6a2dd2fd..40dd60c1d23f 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" - depends on INET && INFINIBAND + depends on INET && INFINIBAND && DIBS depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 503a9f93b392..a7a965e3c0ce 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -18,6 +18,7 @@ #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" +#include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -42,6 +43,9 @@ static struct ism_client smc_ism_client = { .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client smc_dibs_client = { + .name = "SMC-D", +}; static void smc_ism_create_system_eid(void) { @@ -623,11 +627,13 @@ int smc_ism_init(void) #if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif + rc = dibs_register_client(&smc_dibs_client); return rc; } void smc_ism_exit(void) { + dibs_unregister_client(&smc_dibs_client); #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif -- cgit v1.2.3 From 69baaac9361edd169713562f088829a1be9c51a9 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:53 +0200 Subject: dibs: Define dibs_client_ops and dibs_dev_ops Move the device add() and remove() functions from ism_client to dibs_client_ops and call add_dev()/del_dev() for ism devices and dibs_loopback devices. dibs_client_ops->add_dev() = smcd_register_dev() for the smc_dibs_client. This is the first step to handle ism and loopback devices alike (as dibs devices) in the smc dibs client. Define dibs_dev->ops and move smcd_ops->get_chid to dibs_dev_ops->get_fabric_id() for ism and loopback devices. See below for why this needs to be in the same patch as dibs_client_ops->add_dev(). The following changes contain intermediate steps, that will be obsoleted by follow-on patches, once more functionality has been moved to dibs: Use different smcd_ops and max_dmbs for ism and loopback. Follow-on patches will change SMC-D to directly use dibs_ops instead of smcd_ops. In smcd_register_dev() it is now necessary to identify a dibs_loopback device before smcd_dev and smcd_ops->get_chid() are available. So provide dibs_dev_ops->get_fabric_id() in this patch and evaluate it in smc_ism_is_loopback(). Call smc_loopback_init() in smcd_register_dev() and call smc_loopback_exit() in smcd_unregister_dev() to handle the functionality that is still in smc_loopback. Follow-on patches will move all smc_loopback code to dibs_loopback. In smcd_[un]register_dev() use only ism device name, this will be replaced by dibs device name by a follow-on patch. End of changes with intermediate parts. Allocate an smcd event workqueue for all dibs devices, although dibs_loopback does not generate events. Use kernel memory instead of devres memory for smcd_dev and smcd->conn. Since commit a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") an ism device and its driver can have a longer lifetime than the smc module, so smc should not rely on devres to free its resources [1]. It is now the responsibility of the smc client to free smcd and smcd->conn for all dibs devices, ism devices as well as loopback. 
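A simplified contrast of the allocation style this implies (demo_dev, demo_alloc() and demo_free() are illustrative names, not tree code): with plain kernel memory, teardown happens exactly when the client decides, independent of the parent device's devres group.

#include <linux/slab.h>

struct smc_connection;

struct demo_dev {
	struct smc_connection **conn;
};

/* Plain kernel memory: the client owns both allocations and frees
 * them itself, e.g. from its del_dev() callback -- no dependence on
 * the parent device's devres group being torn down at the right time.
 */
static struct demo_dev *demo_alloc(int max_dmbs)
{
	struct demo_dev *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return NULL;
	d->conn = kcalloc(max_dmbs, sizeof(*d->conn), GFP_KERNEL);
	if (!d->conn) {
		kfree(d);
		return NULL;
	}
	return d;
}

static void demo_free(struct demo_dev *d)
{
	kfree(d->conn);
	kfree(d);
}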
Call client->ops->del_dev() for all existing dibs devices in dibs_unregister_client(), so all device related structures can be freed in the client. When dibs_unregister_client() is called in the context of smc_exit() or smc_core_reboot_event(), these functions have already called smc_lgrs_shutdown() which calls smc_smcd_terminate_all(smcd) and sets going_away. This is done a second time in smcd_unregister_dev(). This is analogous to how smcr is handled in these functions, by calling first smc_lgrs_shutdown() and then smc_ib_unregister_client() > smc_ib_remove_dev(), so leave it that way. It may be worth investigating, whether smc_lgrs_shutdown() is still required or useful. Remove CONFIG_SMC_LO. CONFIG_DIBS_LO now controls whether a dibs loopback device exists or not. Link: https://www.kernel.org/doc/Documentation/driver-model/devres.txt [1] Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-8-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - drivers/dibs/dibs_loopback.c | 11 ++++ drivers/dibs/dibs_main.c | 36 +++++++++++ drivers/s390/net/ism_drv.c | 43 ++++++------- include/linux/dibs.h | 89 ++++++++++++++++++++++++- include/linux/ism.h | 2 - include/net/smc.h | 3 +- net/smc/Kconfig | 13 ---- net/smc/Makefile | 2 +- net/smc/af_smc.c | 12 +--- net/smc/smc_ism.c | 132 ++++++++++++++++++++++++++------------ net/smc/smc_ism.h | 7 +- net/smc/smc_loopback.c | 94 ++++----------------------- net/smc/smc_loopback.h | 17 +---- 15 files changed, 270 insertions(+), 193 deletions(-) (limited to 'net') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5a2ed07b6198..a97c8d19f643 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -123,7 +123,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4cbdd7e2ff9f..7f7b52d9a33c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -114,7 +114,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 225514a452a8..215986ae54a4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -18,6 +18,15 @@ /* global loopback device */ static struct dibs_lo_dev *lo_dev; +static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) +{ + return DIBS_LOOPBACK_FABRIC; +} + +static const struct dibs_dev_ops dibs_lo_ops = { + .get_fabric_id = dibs_lo_get_fabric_id, +}; + static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) { dibs_dev_del(ldev->dibs); @@ -40,6 +49,8 @@ static int dibs_lo_dev_probe(void) } ldev->dibs = dibs; + dibs->drv_priv = ldev; + dibs->ops = &dibs_lo_ops; ret = dibs_dev_add(dibs); if (ret) diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index a7e33be36158..f1cfa5849277 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,8 +36,10 @@ static struct dibs_dev_list dibs_dev_list = { int dibs_register_client(struct dibs_client *client) { + struct dibs_dev *dibs; int i, rc = -ENOSPC; + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { if (!clients[i]) { @@ -51,19 +53,37 @@ 
int dibs_register_client(struct dibs_client *client) } mutex_unlock(&clients_lock); + if (i < MAX_DIBS_CLIENTS) { + /* initialize with all devices that we got so far */ + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + dibs->priv[i] = NULL; + client->ops->add_dev(dibs); + } + } + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { + struct dibs_dev *dibs; int rc = 0; + mutex_lock(&dibs_dev_list.mutex); + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + clients[client->id]->ops->del_dev(dibs); + dibs->priv[client->id] = NULL; + } + mutex_lock(&clients_lock); clients[client->id] = NULL; if (client->id + 1 == max_client) max_client--; mutex_unlock(&clients_lock); + + mutex_unlock(&dibs_dev_list.mutex); return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -80,7 +100,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->add_dev(dibs); + } + mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); mutex_unlock(&dibs_dev_list.mutex); @@ -90,7 +118,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->del_dev(dibs); + } + mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); } diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 8ecd0cccc7e8..2bd8f64ebb56 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -79,7 +79,6 @@ int ism_register_client(struct ism_client *client) /* initialize with all devices that we got so far */ list_for_each_entry(ism, &ism_dev_list.list, list) { ism->priv[i] = NULL; - client->add(ism); ism_setup_forwarding(client, ism); } } @@ -465,6 +464,16 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, } EXPORT_SYMBOL_GPL(ism_move); +static u16 ism_get_chid(struct dibs_dev *dibs) +{ + struct ism_dev *ism = dibs->drv_priv; + + if (!ism || !ism->pdev) + return 0; + + return to_zpci(ism->pdev)->pchid; +} + static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; @@ -523,6 +532,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } +static const struct dibs_dev_ops ism_ops = { + .get_fabric_id = ism_get_chid, +}; + static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -564,7 +577,6 @@ static int ism_dev_init(struct ism_dev *ism) mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { if (clients[i]) { - clients[i]->add(ism); ism_setup_forwarding(clients[i], ism); } } @@ -611,12 +623,6 @@ static void ism_dev_exit(struct ism_dev *ism) spin_unlock_irqrestore(&ism->lock, flags); mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) - clients[i]->remove(ism); - } - mutex_unlock(&clients_lock); if (ism_v2_capable) ism_del_vlan_id(ism, ISM_RESERVED_VLANID); @@ -672,7 +678,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = -ENOMEM; goto err_resource; } + /* set this up before we enable interrupts */ ism->dibs = dibs; + dibs->drv_priv = ism; + dibs->ops = &ism_ops; ret = ism_dev_init(ism); if (ret) @@ -857,19 +866,6 @@ 
static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static u16 ism_get_chid(struct ism_dev *ism) -{ - if (!ism || !ism->pdev) - return 0; - - return to_zpci(ism->pdev)->pchid; -} - -static u16 smcd_get_chid(struct smcd_dev *smcd) -{ - return ism_get_chid(smcd->priv); -} - static inline struct device *smcd_get_dev(struct smcd_dev *dev) { struct ism_dev *ism = dev->priv; @@ -877,7 +873,7 @@ static inline struct device *smcd_get_dev(struct smcd_dev *dev) return &ism->dev; } -static const struct smcd_ops ism_ops = { +static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, @@ -889,13 +885,12 @@ static const struct smcd_ops ism_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_chid = smcd_get_chid, .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) { - return &ism_ops; + return &ism_smcd_ops; } EXPORT_SYMBOL_GPL(ism_get_smcd_ops); #endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c12db19c98c0..805ab33271b5 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -34,14 +34,45 @@ * clients. */ +struct dibs_dev; + /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +/* All dibs clients have access to all dibs devices. + * A dibs client provides the following functions to be called by dibs layer or + * dibs device drivers: + */ +struct dibs_client_ops { + /** + * add_dev() - add a dibs device + * @dev: device that was added + * + * Will be called during dibs_register_client() for all existing + * dibs devices and whenever a new dibs device is registered. + * dev is usable until dibs_client.remove() is called. + * *dev is protected by device refcounting. + */ + void (*add_dev)(struct dibs_dev *dev); + /** + * del_dev() - remove a dibs device + * @dev: device to be removed + * + * Will be called whenever a dibs device is removed. + * Will be called during dibs_unregister_client() for all existing + * dibs devices and whenever a dibs device is unregistered. + * The device has already stopped initiative for this client: + * No new handlers will be started. + * The device is no longer usable by this client after this call. + */ + void (*del_dev)(struct dibs_dev *dev); +}; struct dibs_client { /* client name for logging and debugging purposes */ const char *name; + const struct dibs_client_ops *ops; /* client index - provided and used by dibs layer */ u8 id; }; @@ -52,6 +83,7 @@ struct dibs_client { * dibs_register_client() - register a client with dibs layer * @client: this client * + * Will call client->ops->add_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_register_client(struct dibs_client *client); @@ -59,21 +91,74 @@ int dibs_register_client(struct dibs_client *client); * dibs_unregister_client() - unregister a client with dibs layer * @client: this client * + * Will call client->ops->del_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_unregister_client(struct dibs_client *client); +/* dibs clients can call dibs device ops. */ + /* DIBS devices * ------------ */ + +/* Defined fabric id / CHID for all loopback devices: + * All dibs loopback devices report this fabric id. In this case devices with + * the same fabric id can NOT communicate via dibs. Only loopback devices with + * the same dibs device gid can communicate (=same device with itself). 
+ */ +#define DIBS_LOOPBACK_FABRIC 0xFFFF + +/* A dibs device provides the following functions to be called by dibs clients. + * They are mandatory, unless marked 'optional'. + */ +struct dibs_dev_ops { + /** + * get_fabric_id() + * @dev: local dibs device + * + * Only devices on the same dibs fabric can communicate. Fabric_id is + * unique inside the same HW system. Use fabric_id for fast negative + * checks, but only query_remote_gid() can give a reliable positive + * answer: + * Different fabric_id: dibs is not possible + * Same fabric_id: dibs may be possible or not + * (e.g. different HW systems) + * EXCEPTION: DIBS_LOOPBACK_FABRIC denotes an ism_loopback device + * that can only communicate with itself. Use dibs_dev.gid + * or query_remote_gid()to determine whether sender and + * receiver use the same ism_loopback device. + * Return: 2 byte dibs fabric id + */ + u16 (*get_fabric_id)(struct dibs_dev *dev); +}; + struct dibs_dev { struct list_head list; + /* To be filled by device driver, before calling dibs_dev_add(): */ + const struct dibs_dev_ops *ops; + /* priv pointer for device driver */ + void *drv_priv; + + /* priv pointer per client; for client usage only */ + void *priv[MAX_DIBS_CLIENTS]; }; +static inline void dibs_set_priv(struct dibs_dev *dev, + struct dibs_client *client, void *priv) +{ + dev->priv[client->id] = priv; +} + +static inline void *dibs_get_priv(struct dibs_dev *dev, + struct dibs_client *client) +{ + return dev->priv[client->id]; +} + /* ------- End of client-only functions ----------- */ -/* - * Functions to be called by dibs device drivers: +/* Functions to be called by dibs device drivers: */ /** * dibs_dev_alloc() - allocate and reference device structure diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a53d3c48c16..c818a25996db 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -59,8 +59,6 @@ struct ism_event { struct ism_client { const char *name; - void (*add)(struct ism_dev *dev); - void (*remove)(struct ism_dev *dev); void (*handle_event)(struct ism_dev *dev, struct ism_event *event); /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent * via ism_move_data(). Callback function must handle all active bits diff --git a/include/net/smc.h b/include/net/smc.h index a9c023dd1380..e271891b85e6 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "linux/ism.h" struct sock; @@ -62,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ @@ -81,6 +81,7 @@ struct smcd_dev { const struct smcd_ops *ops; void *priv; void *client; + struct dibs_dev *dibs; struct list_head list; spinlock_t lock; struct smc_connection **conn; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 40dd60c1d23f..9535d88c2acb 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,16 +20,3 @@ config SMC_DIAG smcss. if unsure, say Y. - -config SMC_LO - bool "SMC intra-OS shortcut with loopback-ism" - depends on SMC - default n - help - SMC_LO enables the creation of an Emulated-ISM device named - loopback-ism in SMC and makes use of it for transferring data - when communication occurs within the same OS. This helps in - convenient testing of SMC-D since loopback-ism is independent - of architecture or hardware. - - if unsure, say N. 
diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87d5212..96ccfdf246df 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-$(CONFIG_SMC_LO) += smc_loopback.o +smc-y += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9097e4f24d2b..77b99e8ef35a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -57,7 +57,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_loopback.h" #include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group @@ -3591,16 +3590,10 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_loopback_init(); - if (rc) { - pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); - goto out_ib; - } - rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_lo; + goto out_ib; } rc = smc_inet_init(); if (rc) { @@ -3611,8 +3604,6 @@ static int __init smc_init(void) return 0; out_ulp: tcp_unregister_ulp(&smc_ulp_ops); -out_lo: - smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3651,7 +3642,6 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); - smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a7a965e3c0ce..415f03910c91 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,6 +15,7 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" +#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -28,23 +29,27 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) -static void smcd_register_dev(struct ism_dev *ism); -static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", - .add = smcd_register_dev, - .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, +}; + static struct dibs_client smc_dibs_client = { .name = "SMC-D", + .ops = &smc_client_ops, }; static void smc_ism_create_system_eid(void) @@ -86,7 +91,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -318,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + smc_set_pci_values(ism->pdev, &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if 
(nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -368,7 +373,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; - if (smc_ism_is_loopback(smcd)) + if (smc_ism_is_loopback(smcd->dibs)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; @@ -453,24 +458,26 @@ static void smc_ism_event_work(struct work_struct *work) } kfree(wrk); } +#endif -static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, + const struct smcd_ops *ops, + int max_dmbs) { struct smcd_dev *smcd; - smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = devm_kcalloc(parent, max_dmbs, - sizeof(struct smc_connection *), GFP_KERNEL); + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); if (!smcd->conn) - return NULL; + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) - return NULL; + goto free_conn; smcd->ops = ops; @@ -480,27 +487,58 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -static void smcd_register_dev(struct ism_dev *ism) +static void smcd_register_dev(struct dibs_dev *dibs) { - const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; + const struct smcd_ops *ops; + struct smc_lo_dev *smc_lo; + struct ism_dev *ism; - if (!ops) - return; + if (smc_ism_is_loopback(dibs)) { + if (smc_loopback_init(&smc_lo)) + return; + } - smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, - ISM_NR_DMBS); + if (smc_ism_is_loopback(dibs)) { + ops = smc_lo_get_smcd_ops(); + smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + SMC_LO_MAX_DMBS); + } else { + ism = dibs->drv_priv; +#if IS_ENABLED(CONFIG_ISM) + ops = ism_get_smcd_ops(); +#endif + smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + } if (!smcd) return; - smcd->priv = ism; - smcd->client = &smc_ism_client; - ism_set_priv(ism, &smc_ism_client, smcd); - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); - if (smcd->ops->supports_v2()) + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_ism_is_loopback(dibs)) { + smcd->priv = smc_lo; + smc_lo->smcd = smcd; + } else { + smcd->priv = ism; +#if IS_ENABLED(CONFIG_ISM) + ism_set_priv(ism, &smc_ism_client, smcd); + smcd->client = &smc_ism_client; +#endif + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + } + + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); /* sort list: @@ -510,7 +548,7 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); - if (fentry && smc_ism_is_loopback(fentry)) + if (fentry && smc_ism_is_loopback(fentry->dibs)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); @@ -519,32 +557,46 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - if 
(smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: adding smcd loopback device\n"); + } else { + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&ism->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&ism->dev)); + } return; } -static void smcd_unregister_dev(struct ism_dev *ism) +static void smcd_unregister_dev(struct dibs_dev *dibs) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); + struct ism_dev *ism = dibs->drv_priv; - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: removing smcd loopback device\n"); + } else { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ism->dev)); + } smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); + if (smc_ism_is_loopback(dibs)) + smc_loopback_exit(); + kfree(smcd->conn); + kfree(smcd); } +#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 765aa8fae6fa..04699951d03f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "smc.h" @@ -85,14 +86,14 @@ static inline bool __smc_ism_is_emulated(u16 chid) static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { - u16 chid = smcd->ops->get_chid(smcd); + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); return __smc_ism_is_emulated(chid); } -static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) { - return (smcd->ops->get_chid(smcd) == 0xFFFF); + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 1853c26fbbbb..37d8366419f7 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -35,8 +35,6 @@ static void smc_lo_generate_ids(struct smc_lo_dev *ldev) memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), sizeof(lgid->gid_ext)); - - ldev->chid = SMC_LO_RESERVED_CHID; } static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, @@ -257,11 +255,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static u16 smc_lo_get_chid(struct smcd_dev *smcd) -{ - return ((struct smc_lo_dev *)smcd->priv)->chid; -} - static struct device *smc_lo_get_dev(struct smcd_dev *smcd) { return &((struct smc_lo_dev *)smcd->priv)->dev; @@ -281,72 +274,15 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_chid = smc_lo_get_chid, .get_dev = smc_lo_get_dev, }; -static struct smcd_dev 
*smcd_lo_alloc_dev(const struct smcd_ops *ops, - int max_dmbs) -{ - struct smcd_dev *smcd; - - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); - if (!smcd) - return NULL; - - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) - goto out_smcd; - - smcd->ops = ops; - - spin_lock_init(&smcd->lock); - spin_lock_init(&smcd->lgr_lock); - INIT_LIST_HEAD(&smcd->vlan); - INIT_LIST_HEAD(&smcd->lgr_list); - init_waitqueue_head(&smcd->lgrs_deleted); - return smcd; - -out_smcd: - kfree(smcd); - return NULL; -} - -static int smcd_lo_register_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd; - - smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); - if (!smcd) - return -ENOMEM; - ldev->smcd = smcd; - smcd->priv = ldev; - smc_ism_set_v2_capable(); - mutex_lock(&smcd_dev_list.mutex); - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s\n", - dev_name(&ldev->dev)); - return 0; -} - -static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +const struct smcd_ops *smc_lo_get_smcd_ops(void) { - struct smcd_dev *smcd = ldev->smcd; - - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ldev->dev)); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); - mutex_lock(&smcd_dev_list.mutex); - list_del_init(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - kfree(smcd->conn); - kfree(smcd); + return &lo_ops; } -static int smc_lo_dev_init(struct smc_lo_dev *ldev) +static void smc_lo_dev_init(struct smc_lo_dev *ldev) { smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); @@ -354,12 +290,11 @@ static int smc_lo_dev_init(struct smc_lo_dev *ldev) atomic_set(&ldev->dmb_cnt, 0); init_waitqueue_head(&ldev->ldev_release); - return smcd_lo_register_dev(ldev); + return; } static void smc_lo_dev_exit(struct smc_lo_dev *ldev) { - smcd_lo_unregister_dev(ldev); if (atomic_read(&ldev->dmb_cnt)) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } @@ -375,7 +310,6 @@ static void smc_lo_dev_release(struct device *dev) static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; - int ret; ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) @@ -385,17 +319,11 @@ static int smc_lo_dev_probe(void) ldev->dev.release = smc_lo_dev_release; device_initialize(&ldev->dev); dev_set_name(&ldev->dev, smc_lo_dev_name); - - ret = smc_lo_dev_init(ldev); - if (ret) - goto free_dev; + smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ - return 0; -free_dev: - put_device(&ldev->dev); - return ret; + return 0; } static void smc_lo_dev_remove(void) @@ -405,11 +333,17 @@ static void smc_lo_dev_remove(void) smc_lo_dev_exit(lo_dev); put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + lo_dev = NULL; } -int smc_loopback_init(void) +int smc_loopback_init(struct smc_lo_dev **smc_lb) { - return smc_lo_dev_probe(); + int ret; + + ret = smc_lo_dev_probe(); + if (!ret) + *smc_lb = lo_dev; + return ret; } void smc_loopback_exit(void) diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 04dc6808d2e1..76c62526e2e5 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -17,10 +17,8 @@ #include #include -#if IS_ENABLED(CONFIG_SMC_LO) #define SMC_LO_MAX_DMBS 5000 #define SMC_LO_DMBS_HASH_BITS 12 -#define SMC_LO_RESERVED_CHID 0xFFFF struct smc_lo_dmb_node { struct hlist_node list; @@ -35,7 +33,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; struct device dev; - u16 chid; struct smcd_gid local_gid; atomic_t dmb_cnt; 
rwlock_t dmb_ht_lock; @@ -44,17 +41,9 @@ struct smc_lo_dev { wait_queue_head_t ldev_release; }; -int smc_loopback_init(void); +const struct smcd_ops *smc_lo_get_smcd_ops(void); + +int smc_loopback_init(struct smc_lo_dev **smc_lb); void smc_loopback_exit(void); -#else -static inline int smc_loopback_init(void) -{ - return 0; -} - -static inline void smc_loopback_exit(void) -{ -} -#endif #endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From 845c334a0186a23c2ac4abfb444e499fec831b24 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:04:54 +0200 Subject: dibs: Move struct device to dibs_dev Move struct device from ism_dev and smc_lo_dev to dibs_dev, and define a corresponding release function. Free ism_dev in ism_remove() and smc_lo_dev in smc_lo_dev_remove(). Replace smcd->ops->get_dev(smcd) by using dibs->dev directly. An alternative design would be to embed dibs_dev as a field in ism_dev and do the same for other dibs device driver specific structs. However that would have the disadvantage that each dibs device driver needs to allocate dibs_dev and each dibs device driver needs a different device release function. The advantage would be that ism_dev and other device driver specific structs would be covered by device reference counts. Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-9-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 15 +++++++-------- drivers/dibs/dibs_main.c | 21 +++++++++++++++++++- drivers/s390/net/ism_drv.c | 40 ++++++++------------------------------ include/linux/dibs.h | 1 + include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_core.c | 4 ++-- net/smc/smc_ism.c | 46 ++++++++++++++++++-------------------------- net/smc/smc_loopback.c | 21 +------------------- net/smc/smc_loopback.h | 1 - net/smc/smc_pnet.c | 13 +++++-------- 11 files changed, 63 insertions(+), 101 deletions(-) (limited to 'net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 215986ae54a4..76e479d5724b 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -15,6 +15,7 @@ #include "dibs_loopback.h" +static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -27,11 +28,6 @@ static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, }; -static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) -{ - dibs_dev_del(ldev->dibs); -} - static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -52,6 +48,9 @@ static int dibs_lo_dev_probe(void) dibs->drv_priv = ldev; dibs->ops = &dibs_lo_ops; + dibs->dev.parent = NULL; + dev_set_name(&dibs->dev, "%s", dibs_lo_dev_name); + ret = dibs_dev_add(dibs); if (ret) goto err_reg; @@ -60,7 +59,7 @@ static int dibs_lo_dev_probe(void) err_reg: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); kfree(ldev); return ret; @@ -71,9 +70,9 @@ static void dibs_lo_dev_remove(void) if (!lo_dev) return; - dibs_lo_dev_exit(lo_dev); + dibs_dev_del(lo_dev->dibs); /* pairs with dibs_dev_alloc() */ - kfree(lo_dev->dibs); + put_device(&lo_dev->dibs->dev); kfree(lo_dev); lo_dev = NULL; } diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f1cfa5849277..610b6c452211 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -88,11 +88,24 @@ int dibs_unregister_client(struct dibs_client *client) } 
EXPORT_SYMBOL_GPL(dibs_unregister_client); +static void dibs_dev_release(struct device *dev) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + kfree(dibs); +} + struct dibs_dev *dibs_dev_alloc(void) { struct dibs_dev *dibs; dibs = kzalloc(sizeof(*dibs), GFP_KERNEL); + if (!dibs) + return dibs; + dibs->dev.release = dibs_dev_release; + device_initialize(&dibs->dev); return dibs; } @@ -100,7 +113,11 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { - int i; + int i, ret; + + ret = device_add(&dibs->dev); + if (ret) + return ret; mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); @@ -129,6 +146,8 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); + + device_del(&dibs->dev); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2bd8f64ebb56..4096ea9faa7e 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -602,15 +602,6 @@ out: return ret; } -static void ism_dev_release(struct device *dev) -{ - struct ism_dev *ism; - - ism = container_of(dev, struct ism_dev, dev); - - kfree(ism); -} - static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -649,17 +640,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; - ism->dev.parent = &pdev->dev; - ism->dev.release = ism_dev_release; - device_initialize(&ism->dev); - dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev)); - ret = device_add(&ism->dev); - if (ret) - goto err_dev; ret = pci_enable_device_mem(pdev); if (ret) - goto err; + goto err_dev; ret = pci_request_mem_regions(pdev, DRV_NAME); if (ret) @@ -687,6 +671,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + dibs->dev.parent = &pdev->dev; + dev_set_name(&dibs->dev, "%s", dev_name(&pdev->dev)); + ret = dibs_dev_add(dibs); if (ret) goto err_ism; @@ -697,16 +684,14 @@ err_ism: ism_dev_exit(ism); err_dibs: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); err_resource: pci_release_mem_regions(pdev); err_disable: pci_disable_device(pdev); -err: - device_del(&ism->dev); err_dev: dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); return ret; } @@ -719,13 +704,12 @@ static void ism_remove(struct pci_dev *pdev) dibs_dev_del(dibs); ism_dev_exit(ism); /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); pci_release_mem_regions(pdev); pci_disable_device(pdev); - device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); } static struct pci_driver ism_driver = { @@ -866,13 +850,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static inline struct device *smcd_get_dev(struct smcd_dev *dev) -{ - struct ism_dev *ism = dev->priv; - - return &ism->dev; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -885,7 +862,6 @@ static const struct smcd_ops ism_smcd_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 805ab33271b5..793c6e1ece0f 100644 --- a/include/linux/dibs.h +++ 
b/include/linux/dibs.h @@ -135,6 +135,7 @@ struct dibs_dev_ops { struct dibs_dev { struct list_head list; + struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; /* priv pointer for device driver */ diff --git a/include/linux/ism.h b/include/linux/ism.h index c818a25996db..84f1afb3dded 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - struct device dev; u64 local_gid; int ieq_idx; diff --git a/include/net/smc.h b/include/net/smc.h index e271891b85e6..05faac83371e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -63,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a9e80f44307d..42ab0795d563 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -924,7 +924,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; - get_device(smcd->ops->get_dev(smcd)); + get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = @@ -1474,7 +1474,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 415f03910c91..6a6e7c9641e8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -303,12 +303,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; - struct ism_dev *ism; int use_cnt = 0; void *nlh; - ism = smcd->priv; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -323,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(ism->pdev, &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -509,14 +509,14 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_ism_is_loopback(dibs)) { ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, SMC_LO_MAX_DMBS); } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, ISM_NR_DMBS); } if (!smcd) @@ -534,10 +534,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) ism_set_priv(ism, &smc_ism_client, smcd); smcd->client = &smc_ism_client; #endif - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); } + if 
(smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); @@ -557,33 +558,24 @@ static void smcd_register_dev(struct dibs_dev *dibs) } mutex_unlock(&smcd_dev_list.mutex); - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: adding smcd loopback device\n"); - } else { - if (smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); - } + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); return; } static void smcd_unregister_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); - struct ism_dev *ism = dibs->drv_priv; - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: removing smcd loopback device\n"); - } else { - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); - } + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&dibs->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 37d8366419f7..262d0d0df4d0 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -23,7 +23,6 @@ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) -static const char smc_lo_dev_name[] = "loopback-ism"; static struct smc_lo_dev *lo_dev; static void smc_lo_generate_ids(struct smc_lo_dev *ldev) @@ -255,11 +254,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static struct device *smc_lo_get_dev(struct smcd_dev *smcd) -{ - return &((struct smc_lo_dev *)smcd->priv)->dev; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -274,7 +268,6 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_dev = smc_lo_get_dev, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -299,14 +292,6 @@ static void smc_lo_dev_exit(struct smc_lo_dev *ldev) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } -static void smc_lo_dev_release(struct device *dev) -{ - struct smc_lo_dev *ldev = - container_of(dev, struct smc_lo_dev, dev); - - kfree(ldev); -} - static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; @@ -315,10 +300,6 @@ static int smc_lo_dev_probe(void) if (!ldev) return -ENOMEM; - ldev->dev.parent = NULL; - ldev->dev.release = smc_lo_dev_release; - device_initialize(&ldev->dev); - dev_set_name(&ldev->dev, smc_lo_dev_name); smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ @@ -332,7 +313,7 @@ static void smc_lo_dev_remove(void) return; smc_lo_dev_exit(lo_dev); - put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + kfree(lo_dev); lo_dev = NULL; } diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 76c62526e2e5..a033bf10890a 100644 --- a/net/smc/smc_loopback.h 
+++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct device dev; struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 7225b5fa17a6..d0df7f2b03aa 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", - dev_name(smcd->ops->get_dev(smcd)), + dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; @@ -332,7 +332,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } @@ -413,7 +413,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; - struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -431,10 +430,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { - dev = smcd->ops->get_dev(smcd); - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(dev), + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), smcd->pnetid); } } @@ -1193,7 +1190,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); + const char *ib_name = dev_name(&smcddev->dibs->dev); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; -- cgit v1.2.3 From 804737349813a4ffc0e2a66579cb3cc42eb46446 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:04:55 +0200 Subject: dibs: Create class dibs Create '/sys/class/dibs' to represent multiple kinds of dibs devices in sysfs. Show s390/ism devices as well as dibs_loopback devices. Show attribute fabric_id using dibs_ops.get_fabric_id(). This can help users understand which dibs devices are connected to the same fabric in different systems and which dibs devices are loopback devices (fabric_id 0xffff). Instead of using the same name as the pci device, give the ism devices their own readable names based on uid or fid from the HW definition. smc_loopback was never visible in sysfs. dibs_loopback is now represented as a virtual device. For the SMC feature "software defined pnet-id" either the ib device name or the PCI-ID (actually the parent device name) can be used for SMC-R entries. Mimic this behaviour for SMC-D, and check the parent device name as well. So device name or PCI-ID can be used for ism and device name can be used for dibs-loopback. Note that this: IB_DEVICE_NAME_MAX - 1 == smc_pnet_policy.[SMC_PNETID_IBNAME].len is the length of smcd_name. Future SW-pnetid cleanup patches could use a meaningful define, but that would touch too much unrelated code here.
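As an illustration, the resulting matching rule can be sketched as follows; smcd_matches_name() is a hypothetical helper used only for this sketch, not part of the patch:

	static bool smcd_matches_name(struct dibs_dev *dibs, const char *name)
	{
		if (!strncmp(dev_name(&dibs->dev), name, IB_DEVICE_NAME_MAX - 1))
			return true;
		return dibs->dev.parent &&
		       !strncmp(dev_name(dibs->dev.parent), name,
				IB_DEVICE_NAME_MAX - 1);
	}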
Examples: --------- ism before: > ls /sys/bus/pci/devices/0000:00:00.0/0000:00:00.0 uevent ism now: > ls /sys/bus/pci/devices/0000:00:00.0/dibs/ism30 device -> ../../../0000:00:00.0/ fabric_id subsystem -> ../../../../../class/dibs/ uevent dibs loopback: > ls /sys/devices/virtual/dibs/lo/ fabric_id subsystem -> ../../../../class/dibs/ uevent dibs class: > ls -l /sys/class/dibs/ ism30 -> ../../devices/pci0000:00/0000:00:00.0/dibs/ism30/ lo -> ../../devices/virtual/dibs/lo/ For comparison: > ls -l /sys/class/net/ enc8410 -> ../../devices/qeth/0.0.8410/net/enc8410/ ens1693 -> ../../devices/pci0001:00/0001:00:00.0/net/ens1693/ lo -> ../../devices/virtual/net/lo/ Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-10-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_main.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/s390/net/ism_drv.c | 5 ++++- net/smc/smc_pnet.c | 16 ++++++++++++---- 3 files changed, 56 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index 610b6c452211..b3f21805aa59 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -20,6 +20,8 @@ MODULE_DESCRIPTION("Direct Internal Buffer Sharing class"); MODULE_LICENSE("GPL"); +static struct class *dibs_class; + /* use an array rather a list for fast mapping: */ static struct dibs_client *clients[MAX_DIBS_CLIENTS]; static u8 max_client; @@ -105,12 +107,35 @@ struct dibs_dev *dibs_dev_alloc(void) if (!dibs) return dibs; dibs->dev.release = dibs_dev_release; + dibs->dev.class = dibs_class; device_initialize(&dibs->dev); return dibs; } EXPORT_SYMBOL_GPL(dibs_dev_alloc); +static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dibs_dev *dibs; + u16 fabric_id; + + dibs = container_of(dev, struct dibs_dev, dev); + fabric_id = dibs->ops->get_fabric_id(dibs); + + return sysfs_emit(buf, "0x%04x\n", fabric_id); +} +static DEVICE_ATTR_RO(fabric_id); + +static struct attribute *dibs_dev_attrs[] = { + &dev_attr_fabric_id.attr, + NULL, +}; + +static const struct attribute_group dibs_dev_attr_group = { + .attrs = dibs_dev_attrs, +}; + int dibs_dev_add(struct dibs_dev *dibs) { int i, ret; @@ -119,6 +144,11 @@ int dibs_dev_add(struct dibs_dev *dibs) if (ret) return ret; + ret = sysfs_create_group(&dibs->dev.kobj, &dibs_dev_attr_group); + if (ret) { + dev_err(&dibs->dev, "sysfs_create_group failed for dibs_dev\n"); + goto err_device_del; + } mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -130,6 +160,11 @@ int dibs_dev_add(struct dibs_dev *dibs) mutex_unlock(&dibs_dev_list.mutex); return 0; + +err_device_del: + device_del(&dibs->dev); + return ret; + } EXPORT_SYMBOL_GPL(dibs_dev_add); @@ -158,6 +193,10 @@ static int __init dibs_init(void) memset(clients, 0, sizeof(clients)); max_client = 0; + dibs_class = class_create("dibs"); + if (IS_ERR(&dibs_class)) + return PTR_ERR(&dibs_class); + rc = dibs_loopback_init(); if (rc) pr_err("%s fails with %d\n", __func__, rc); @@ -168,6 +207,7 @@ static int __init dibs_init(void) static void __exit dibs_exit(void) { dibs_loopback_exit(); + class_destroy(dibs_class); } module_init(dibs_init); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 4096ea9faa7e..ab1d61eb3e3b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -629,6 +629,7 @@ static void ism_dev_exit(struct 
ism_dev *ism) static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct dibs_dev *dibs; + struct zpci_dev *zdev; struct ism_dev *ism; int ret; @@ -672,7 +673,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_dibs; dibs->dev.parent = &pdev->dev; - dev_set_name(&dibs->dev, "%s", dev_name(&pdev->dev)); + + zdev = to_zpci(pdev); + dev_set_name(&dibs->dev, "ism%x", zdev->uid ? zdev->uid : zdev->fid); ret = dibs_dev_add(dibs); if (ret) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index d0df7f2b03aa..a3a1e1fde8eb 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -332,8 +332,11 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(&smcd_dev->dibs->dev), - smcd_name, IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, + IB_DEVICE_NAME_MAX - 1) || + (smcd_dev->dibs->dev.parent && + !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name, + IB_DEVICE_NAME_MAX - 1))) goto out; } smcd_dev = NULL; @@ -1190,7 +1193,6 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(&smcddev->dibs->dev); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; @@ -1203,7 +1205,13 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && - !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + (!strncmp(tmp_pe->ib_name, + dev_name(&smcddev->dibs->dev), + sizeof(tmp_pe->ib_name)) || + (smcddev->dibs->dev.parent && + !strncmp(tmp_pe->ib_name, + dev_name(smcddev->dibs->dev.parent), + sizeof(tmp_pe->ib_name))))) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; -- cgit v1.2.3 From 05e68d8dedf34f270cc3769ffe7f0ed413f23add Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:56 +0200 Subject: dibs: Local gid for dibs devices Define a uuid_t GID attribute to identify a dibs device. SMC uses 64 Bit and 128 Bit Global Identifiers (GIDs) per device that need to be sent via the SMC protocol. Because the smc code uses integers, network endianness and host endianness need to be considered. Avoid this in the dibs layer by using uuid_t byte arrays. Future patches could change SMC to use uuid_t. For now conversion helper functions are introduced. ISM devices provide 64 Bit GIDs. Map them to dibs uuid_t GIDs like this: _________________________________________ | 64 Bit ISM-vPCI GID | 00000000_00000000 | ----------------------------------------- If interpreted as a UUID [1], this maps to the UUID variant that is reserved for NCS backward compatibility. So it will not collide with UUIDs that were generated according to the standard. smc_loopback already uses version 4 UUIDs as 128 Bit GIDs; move that to dibs loopback. A temporary change to smc_lo_query_rgid() is required; it will be moved to dibs_loopback with a follow-on patch. Provide the gid of a dibs device as a read-only sysfs attribute.
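Expressed as code, the mapping sketched above amounts to the following (ism_gid_to_dibs_gid() is an illustrative helper, not part of the patch):

	static void ism_gid_to_dibs_gid(uuid_t *dibs_gid, const u64 *ism_gid)
	{
		memset(dibs_gid, 0, sizeof(*dibs_gid));
		/* 64 Bit ISM-vPCI GID into bytes 0..7; bytes 8..15 stay zero */
		memcpy(dibs_gid, ism_gid, sizeof(*ism_gid));
	}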
Link: https://datatracker.ietf.org/doc/html/rfc4122 [1] Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-11-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 1 + drivers/dibs/dibs_main.c | 12 ++++++++++++ drivers/s390/net/ism.h | 9 +++++++++ drivers/s390/net/ism_drv.c | 30 +++++++++--------------------- include/linux/dibs.h | 3 +++ include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_clc.c | 6 +++--- net/smc/smc_core.c | 2 +- net/smc/smc_diag.c | 2 +- net/smc/smc_ism.h | 22 ++++++++++++++++++++++ net/smc/smc_loopback.c | 29 ++++------------------------- net/smc/smc_loopback.h | 1 - 13 files changed, 65 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 76e479d5724b..d7e6fa5e90f3 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -46,6 +46,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; dibs->dev.parent = NULL; diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index b3f21805aa59..f20ed0594a51 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -114,6 +114,17 @@ struct dibs_dev *dibs_dev_alloc(void) } EXPORT_SYMBOL_GPL(dibs_dev_alloc); +static ssize_t gid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + return sysfs_emit(buf, "%pUb\n", &dibs->gid); +} +static DEVICE_ATTR_RO(gid); + static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -128,6 +139,7 @@ static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(fabric_id); static struct attribute *dibs_dev_attrs[] = { + &dev_attr_gid.attr, &dev_attr_fabric_id.attr, NULL, }; diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 3078779fa71e..1b9fa14da20c 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -67,6 +67,15 @@ union ism_reg_ieq { } response; } __aligned(16); +/* ISM-vPCI devices provide 64 Bit GIDs + * Map them to ISM UUID GIDs like this: + * _________________________________________ + * | 64 Bit ISM-vPCI GID | 00000000_00000000 | + * ----------------------------------------- + * This will be interpreted as a UIID variant, that is reserved + * for NCS backward compatibility. So it will not collide with + * proper UUIDs. 
+ */ union ism_read_gid { struct { struct ism_req_hdr hdr; diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ab1d61eb3e3b..e58c55fb03c2 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -272,8 +272,9 @@ static int unregister_ieq(struct ism_dev *ism) return 0; } -static int ism_read_local_gid(struct ism_dev *ism) +static int ism_read_local_gid(struct dibs_dev *dibs) { + struct ism_dev *ism = dibs->drv_priv; union ism_read_gid cmd; int ret; @@ -285,7 +286,8 @@ static int ism_read_local_gid(struct ism_dev *ism) if (ret) goto out; - ism->local_gid = cmd.response.gid; + memset(&dibs->gid, 0, sizeof(dibs->gid)); + memcpy(&dibs->gid, &cmd.response.gid, sizeof(cmd.response.gid)); out: return ret; } @@ -563,10 +565,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - ret = ism_read_local_gid(ism); - if (ret) - goto unreg_ieq; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) /* hardware is V2 capable */ ism_v2_capable = true; @@ -588,8 +586,6 @@ static int ism_dev_init(struct ism_dev *ism) query_info(ism); return 0; -unreg_ieq: - unregister_ieq(ism); unreg_sba: unregister_sba(ism); free_irq: @@ -672,6 +668,11 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + /* after ism_dev_init() we can call ism function to set gid */ + ret = ism_read_local_gid(dibs); + if (ret) + goto err_ism; + dibs->dev.parent = &pdev->dev; zdev = to_zpci(pdev); @@ -841,18 +842,6 @@ static int smcd_supports_v2(void) return ism_v2_capable; } -static u64 ism_get_local_gid(struct ism_dev *ism) -{ - return ism->local_gid; -} - -static void smcd_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - smcd_gid->gid = ism_get_local_gid(smcd->priv); - smcd_gid->gid_ext = 0; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -864,7 +853,6 @@ static const struct smcd_ops ism_smcd_ops = { .signal_event = smcd_signal_ieq, .move_data = smcd_move, .supports_v2 = smcd_supports_v2, - .get_local_gid = smcd_get_local_gid, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 793c6e1ece0f..904f37505c27 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -10,6 +10,8 @@ #define _DIBS_H #include +#include + /* DIBS - Direct Internal Buffer Sharing - concept * ----------------------------------------------- * In the case of multiple system sharing the same hardware, dibs fabrics can @@ -138,6 +140,7 @@ struct dibs_dev { struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; + uuid_t gid; /* priv pointer for device driver */ void *drv_priv; diff --git a/include/linux/ism.h b/include/linux/ism.h index 84f1afb3dded..a926dd61b5a1 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - u64 local_gid; int ieq_idx; struct ism_client *subs[MAX_CLIENTS]; diff --git a/include/net/smc.h b/include/net/smc.h index 05faac83371e..9cb8385bbc6e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -62,7 +62,6 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); - void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 
09745baa1017..157aace169d4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -916,7 +916,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); @@ -966,7 +966,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); @@ -1059,7 +1059,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 42ab0795d563..be0c2da83d2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8ed2f6689b01..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 04699951d03f..139e99da2c9f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -96,4 +96,26 @@ static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); +} + #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 262d0d0df4d0..454d9d6a6e8f 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -13,6 +13,7 @@ #include #include +#include #include #include "smc_cdc.h" @@ -25,25 +26,14 @@ static struct smc_lo_dev *lo_dev; -static void smc_lo_generate_ids(struct smc_lo_dev *ldev) -{ - struct smcd_gid *lgid = &ldev->local_gid; - uuid_t 
uuid; - - uuid_gen(&uuid); - memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); - memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), - sizeof(lgid->gid_ext)); -} - static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, u32 vid_valid, u32 vid) { - struct smc_lo_dev *ldev = smcd->priv; + uuid_t temp; + copy_to_dibsgid(&temp, rgid); /* rgid should be the same as lgid */ - if (!ldev || rgid->gid != ldev->local_gid.gid || - rgid->gid_ext != ldev->local_gid.gid_ext) + if (!uuid_equal(&temp, &smcd->dibs->gid)) return -ENETUNREACH; return 0; } @@ -245,15 +235,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, return 0; } -static void smc_lo_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - smcd_gid->gid = ldev->local_gid.gid; - smcd_gid->gid_ext = ldev->local_gid.gid_ext; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -267,7 +248,6 @@ static const struct smcd_ops lo_ops = { .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, - .get_local_gid = smc_lo_get_local_gid, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -277,7 +257,6 @@ const struct smcd_ops *smc_lo_get_smcd_ops(void) static void smc_lo_dev_init(struct smc_lo_dev *ldev) { - smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); hash_init(ldev->dmb_ht); atomic_set(&ldev->dmb_cnt, 0); diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index a033bf10890a..33bb96ec8b77 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); -- cgit v1.2.3 From 92a0f7bb081dde6e88368816b8ba51352ddabb1d Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:57 +0200 Subject: dibs: Move vlan support to dibs_dev_ops It can be debated how much benefit the definition of vlan ids for dibs devices brings, as the dmbs are accessible only by a single peer anyhow. But ism provides vlan support and smcd exploits it, so move it to the dibs layer as an optional feature. smcd_loopback simply ignores all vlan settings; do the same in dibs_loopback. SMC-D and ISM have a method to use the invalid VLAN ID 1FFF (ISM_RESERVED_VLANID) to indicate that both communication peers support routable SMC-Dv2. Tolerate it in dibs, but move it to SMC only.
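The capability check on the SMC side then reduces to the following sketch (mirroring the smcd_register_dev() hunk below):

	if (smc_ism_is_loopback(dibs) ||
	    (dibs->ops->add_vlan_id &&
	     !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID)))
		smc_ism_set_v2_capable();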
Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-12-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 47 ++++++---------------------------------------- include/linux/dibs.h | 19 +++++++++++++++++++ include/net/smc.h | 5 ----- net/smc/smc_ism.c | 14 +++++++++----- net/smc/smc_loopback.c | 5 ----- 5 files changed, 34 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e58c55fb03c2..ed4c28ca355b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -36,7 +36,6 @@ static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ /* a list for fast mapping */ static u8 max_client; static DEFINE_MUTEX(clients_lock); -static bool ism_v2_capable; struct ism_dev_list { struct list_head list; struct mutex mutex; /* protects ism device list */ @@ -409,8 +408,9 @@ out: } EXPORT_SYMBOL_GPL(ism_unregister_dmb); -static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -422,8 +422,9 @@ static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -536,6 +537,8 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .add_vlan_id = ism_add_vlan_id, + .del_vlan_id = ism_del_vlan_id, }; static int ism_dev_init(struct ism_dev *ism) @@ -565,12 +568,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) - /* hardware is V2 capable */ - ism_v2_capable = true; - else - ism_v2_capable = false; - mutex_lock(&ism_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -611,8 +608,6 @@ static void ism_dev_exit(struct ism_dev *ism) mutex_lock(&ism_dev_list.mutex); - if (ism_v2_capable) - ism_del_vlan_id(ism, ISM_RESERVED_VLANID); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); @@ -786,26 +781,6 @@ static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); } -static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_add_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_del_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_set_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); -} - -static int smcd_reset_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -837,22 +812,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); } -static int smcd_supports_v2(void) -{ - return ism_v2_capable; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, - .add_vlan_id = smcd_add_vlan_id, 
- .del_vlan_id = smcd_del_vlan_id, - .set_vlan_required = smcd_set_vlan_required, - .reset_vlan_required = smcd_reset_vlan_required, .signal_event = smcd_signal_ieq, .move_data = smcd_move, - .supports_v2 = smcd_supports_v2, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 904f37505c27..166148fb8d76 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,25 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + /** + * add_vlan_id() - add dibs device to vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to the this vlan. A device can belong to more than 1 vlan. + * Any device can access an untagged dmb. + * Deprecated, only supported for backwards compatibility. + * Return: zero on success + */ + int (*add_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * del_vlan_id() - remove dibs device from vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * Return: zero on success + */ + int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); }; struct dibs_dev { diff --git a/include/net/smc.h b/include/net/smc.h index 9cb8385bbc6e..51b4aefc106a 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -61,13 +61,8 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); - int (*supports_v2)(void); /* optional operations */ - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); int (*support_dmb_nocopy)(struct smcd_dev *dev); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 6a6e7c9641e8..5118441bed18 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -140,7 +140,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->add_vlan_id) + if (!smcd->dibs->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ @@ -163,7 +163,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found. 
* add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -187,7 +187,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->del_vlan_id) + if (!smcd->dibs->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); @@ -205,7 +205,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -539,8 +539,12 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); - if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { smc_ism_set_v2_capable(); + } + mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 454d9d6a6e8f..982a19430313 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -20,7 +20,6 @@ #include "smc_ism.h" #include "smc_loopback.h" -#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) @@ -242,10 +241,6 @@ static const struct smcd_ops lo_ops = { .support_dmb_nocopy = smc_lo_support_dmb_nocopy, .attach_dmb = smc_lo_attach_dmb, .detach_dmb = smc_lo_detach_dmb, - .add_vlan_id = NULL, - .del_vlan_id = NULL, - .set_vlan_required = NULL, - .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, }; -- cgit v1.2.3 From 719c3b67bb7ea95bb8158b03c75641c8fc8f94a0 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:58 +0200 Subject: dibs: Move query_remote_gid() to dibs_dev_ops Provide the dibs_dev_ops->query_remote_gid() in ism and dibs_loopback dibs_devices. And call it in smc dibs_client. 
Reviewed-by: Julian Ruess Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-13-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 10 ++++++++++ drivers/s390/net/ism_drv.c | 41 ++++++++++++++++++----------------------- include/linux/dibs.h | 14 ++++++++++++++ include/net/smc.h | 2 -- net/smc/smc_ism.c | 8 ++++++-- net/smc/smc_loopback.c | 13 ------------- 6 files changed, 48 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index d7e6fa5e90f3..6b53e626a6d1 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -24,8 +24,18 @@ static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) return DIBS_LOOPBACK_FABRIC; } +static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + /* rgid should be the same as lgid */ + if (!uuid_equal(rgid, &dibs->gid)) + return -ENETUNREACH; + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, + .query_remote_gid = dibs_lo_query_rgid, }; static int dibs_lo_dev_probe(void) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ed4c28ca355b..121b3a2be760 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -291,6 +291,23 @@ out: return ret; } +static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_query_rgid cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_QUERY_RGID; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.vlan_valid = vid_valid; + cmd.request.vlan_id = vid; + + return ism_cmd(ism, &cmd); +} + static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -537,6 +554,7 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .query_remote_gid = ism_query_rgid, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -748,28 +766,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, - u32 vid) -{ - union ism_query_rgid cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_QUERY_RGID; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.vlan_valid = vid_valid; - cmd.request.vlan_id = vid; - - return ism_cmd(ism, &cmd); -} - -static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - return ism_query_rgid(smcd->priv, rgid->gid, vid_valid, vid); -} - static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client) { @@ -813,7 +809,6 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, } static const struct smcd_ops ism_smcd_ops = { - .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 166148fb8d76..c75a40fe3039 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,20 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + 
/** + * query_remote_gid() + * @dev: local dibs device + * @rgid: gid of remote dibs device + * @vid_valid: if zero, vid will be ignored; + * deprecated, ignored if device does not support vlan + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * Query whether a remote dibs device is reachable via this local device + * and this vlan id. + * Return: 0 if remote gid is reachable. + */ + int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, + u32 vid_valid, u32 vid); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device diff --git a/include/net/smc.h b/include/net/smc.h index 51b4aefc106a..5bd135fb4d49 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -53,8 +53,6 @@ struct smcd_gid { }; struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, void *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5118441bed18..d20d00b46825 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -77,8 +77,12 @@ static void smc_ism_create_system_eid(void) int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 982a19430313..52cba01cb209 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -25,18 +25,6 @@ static struct smc_lo_dev *lo_dev; -static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - uuid_t temp; - - copy_to_dibsgid(&temp, rgid); - /* rgid should be the same as lgid */ - if (!uuid_equal(&temp, &smcd->dibs->gid)) - return -ENETUNREACH; - return 0; -} - static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client_priv) { @@ -235,7 +223,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, } static const struct smcd_ops lo_ops = { - .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, .unregister_dmb = smc_lo_unregister_dmb, .support_dmb_nocopy = smc_lo_support_dmb_nocopy, -- cgit v1.2.3 From cc21191b584c6f7836b0f10774f8278b7cbfba10 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:59 +0200 Subject: dibs: Move data path to dibs layer Use struct dibs_dmb instead of struct smc_dmb and move the corresponding client tables to dibs_dev. Leave driver specific implementation details like sba in the device drivers. Register and unregister dmbs via dibs_dev_ops. A dmb is dedicated to a single client, but a dibs device can have dmbs for more than one client. Trigger dibs clients via dibs_client_ops->handle_irq(), when data is received into a dmb. For dibs_loopback replace scheduling an smcd receive tasklet with calling dibs_client_ops->handle_irq(). For loopback devices attach_dmb(), detach_dmb() and move_data() need to access the dmb tables, so move those to dibs_dev_ops in this patch as well. Remove remaining definitions of smc_loopback as they are no longer required, now that everything is in dibs_loopback. 
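The delivery path can be sketched as follows (cf. the ism_handle_irq() and dibs_lo_move_data() hunks below): the device driver looks up the client that owns the signalled dmb index and forwards the interrupt to it:

	client_id = dibs->dmb_clientid_arr[idx];
	if (client_id != NO_DIBS_CLIENT && dibs->subs[client_id])
		dibs->subs[client_id]->ops->handle_irq(dibs, idx, dmbemask);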
Note that struct ism_client and struct ism_dev are still required in smc until a follow-on patch moves event handling to dibs. (Loopback does not use events). Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-14-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 257 +++++++++++++++++++++++++++++++++++++ drivers/dibs/dibs_loopback.h | 19 +++ drivers/dibs/dibs_main.c | 56 ++++++++- drivers/s390/net/ism_drv.c | 121 ++++++++---------- include/linux/dibs.h | 177 ++++++++++++++++++++++++++ include/linux/ism.h | 23 ---- include/net/smc.h | 22 ---- net/smc/Makefile | 1 - net/smc/smc_ism.c | 70 +++++------ net/smc/smc_ism.h | 4 +- net/smc/smc_loopback.c | 294 ------------------------------------------- net/smc/smc_loopback.h | 47 ------- 12 files changed, 591 insertions(+), 500 deletions(-) delete mode 100644 net/smc/smc_loopback.c delete mode 100644 net/smc/smc_loopback.h (limited to 'net') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 6b53e626a6d1..b3fd0f8100d4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -9,12 +9,18 @@ * */ +#include +#include #include #include +#include #include #include "dibs_loopback.h" +#define DIBS_LO_SUPPORT_NOCOPY 0x1 +#define DIBS_DMA_ADDR_INVALID (~(dma_addr_t)0) + static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -33,11 +39,259 @@ static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return 0; } +static int dibs_lo_max_dmbs(void) +{ + return DIBS_LO_MAX_DMBS; +} + +static int dibs_lo_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) +{ + struct dibs_lo_dmb_node *dmb_node, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + int sba_idx, rc; + + ldev = dibs->drv_priv; + sba_idx = dmb->idx; + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, DIBS_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == DIBS_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = DIBS_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[sba_idx] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __dibs_lo_unregister_dmb(struct dibs_lo_dev *ldev, + struct dibs_lo_dmb_node *dmb_node) +{ + 
/* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int dibs_lo_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + + ldev = dibs->drv_priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + read_unlock_bh(&ldev->dmb_ht_lock); + if (!dmb_node) + return -EINVAL; + + if (refcount_dec_and_test(&dmb_node->refcnt)) { + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb_node->sba_idx] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); + + __dibs_lo_unregister_dmb(ldev, dmb_node); + } + return 0; +} + +static int dibs_lo_support_dmb_nocopy(struct dibs_dev *dibs) +{ + return DIBS_LO_SUPPORT_NOCOPY; +} + +static int dibs_lo_attach_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. 
+ */ + return -EINVAL; + + /* provide dmb information */ + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int dibs_lo_detach_dmb(struct dibs_dev *dibs, u64 token) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __dibs_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int dibs_lo_move_data(struct dibs_dev *dibs, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct dibs_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + u16 s_mask; + u8 client_id; + u32 sba_idx; + + ldev = dibs->drv_priv; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + sba_idx = rmb_node->sba_idx; + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!sf) + return 0; + + spin_lock(&dibs->lock); + client_id = dibs->dmb_clientid_arr[sba_idx]; + s_mask = ror16(0x1000, idx); + if (likely(client_id != NO_DIBS_CLIENT && dibs->subs[client_id])) + dibs->subs[client_id]->ops->handle_irq(dibs, sba_idx, s_mask); + spin_unlock(&dibs->lock); + + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, .query_remote_gid = dibs_lo_query_rgid, + .max_dmbs = dibs_lo_max_dmbs, + .register_dmb = dibs_lo_register_dmb, + .unregister_dmb = dibs_lo_unregister_dmb, + .move_data = dibs_lo_move_data, + .support_mmapped_rdmb = dibs_lo_support_dmb_nocopy, + .attach_dmb = dibs_lo_attach_dmb, + .detach_dmb = dibs_lo_detach_dmb, }; +static void dibs_lo_dev_init(struct dibs_lo_dev *ldev) +{ + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); +} + +static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) +{ + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -56,6 +310,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + dibs_lo_dev_init(ldev); uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; @@ -69,6 +324,7 @@ static int dibs_lo_dev_probe(void) return 0; err_reg: + kfree(dibs->dmb_clientid_arr); /* pairs with dibs_dev_alloc() */ put_device(&dibs->dev); kfree(ldev); @@ -82,6 +338,7 @@ static void dibs_lo_dev_remove(void) return; dibs_dev_del(lo_dev->dibs); + dibs_lo_dev_exit(lo_dev); /* pairs with dibs_dev_alloc() */ put_device(&lo_dev->dibs->dev); kfree(lo_dev); diff --git a/drivers/dibs/dibs_loopback.h b/drivers/dibs/dibs_loopback.h index fd03b6333a24..0664f6a8e662 100644 --- a/drivers/dibs/dibs_loopback.h +++ b/drivers/dibs/dibs_loopback.h @@ -13,13 +13,32 @@ #define _DIBS_LOOPBACK_H #include +#include +#include #include #include #if 
IS_ENABLED(CONFIG_DIBS_LO) +#define DIBS_LO_DMBS_HASH_BITS 12 +#define DIBS_LO_MAX_DMBS 5000 + +struct dibs_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; struct dibs_lo_dev { struct dibs_dev *dibs; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, DIBS_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, DIBS_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; }; int dibs_loopback_init(void); diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f20ed0594a51..aacb3ea7825a 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,6 +36,16 @@ static struct dibs_dev_list dibs_dev_list = { .mutex = __MUTEX_INITIALIZER(dibs_dev_list.mutex), }; +static void dibs_setup_forwarding(struct dibs_client *client, + struct dibs_dev *dibs) +{ + unsigned long flags; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->subs[client->id] = client; + spin_unlock_irqrestore(&dibs->lock, flags); +} + int dibs_register_client(struct dibs_client *client) { struct dibs_dev *dibs; @@ -60,6 +70,7 @@ int dibs_register_client(struct dibs_client *client) list_for_each_entry(dibs, &dibs_dev_list.list, list) { dibs->priv[i] = NULL; client->ops->add_dev(dibs); + dibs_setup_forwarding(client, dibs); } } mutex_unlock(&dibs_dev_list.mutex); @@ -71,10 +82,25 @@ EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { struct dibs_dev *dibs; + unsigned long flags; + int max_dmbs; int rc = 0; mutex_lock(&dibs_dev_list.mutex); list_for_each_entry(dibs, &dibs_dev_list.list, list) { + spin_lock_irqsave(&dibs->lock, flags); + max_dmbs = dibs->ops->max_dmbs(); + for (int i = 0; i < max_dmbs; ++i) { + if (dibs->dmb_clientid_arr[i] == client->id) { + WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", + __func__, client->name); + rc = -EBUSY; + goto err_reg_dmb; + } + } + /* Stop forwarding IRQs */ + dibs->subs[client->id] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); dibs->priv[client->id] = NULL; } @@ -87,6 +113,11 @@ int dibs_unregister_client(struct dibs_client *client) mutex_unlock(&dibs_dev_list.mutex); return rc; + +err_reg_dmb: + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -150,11 +181,19 @@ static const struct attribute_group dibs_dev_attr_group = { int dibs_dev_add(struct dibs_dev *dibs) { + int max_dmbs; int i, ret; + max_dmbs = dibs->ops->max_dmbs(); + spin_lock_init(&dibs->lock); + dibs->dmb_clientid_arr = kzalloc(max_dmbs, GFP_KERNEL); + if (!dibs->dmb_clientid_arr) + return -ENOMEM; + memset(dibs->dmb_clientid_arr, NO_DIBS_CLIENT, max_dmbs); + ret = device_add(&dibs->dev); if (ret) - return ret; + goto free_client_arr; ret = sysfs_create_group(&dibs->dev.kobj, &dibs_dev_attr_group); if (ret) { @@ -164,8 +203,10 @@ int dibs_dev_add(struct dibs_dev *dibs) mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { - if (clients[i]) + if (clients[i]) { clients[i]->ops->add_dev(dibs); + dibs_setup_forwarding(clients[i], dibs); + } } mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); @@ -175,6 +216,8 @@ int dibs_dev_add(struct dibs_dev *dibs) err_device_del: device_del(&dibs->dev); +free_client_arr: + kfree(dibs->dmb_clientid_arr); return ret; } @@ -182,8 +225,16 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct 
dibs_dev *dibs) { + unsigned long flags; int i; + sysfs_remove_group(&dibs->dev.kobj, &dibs_dev_attr_group); + + spin_lock_irqsave(&dibs->lock, flags); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) + dibs->subs[i] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -195,6 +246,7 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&dibs_dev_list.mutex); device_del(&dibs->dev); + kfree(dibs->dmb_clientid_arr); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 121b3a2be760..346d1ea8650b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -98,14 +98,6 @@ int ism_unregister_client(struct ism_client *client) spin_lock_irqsave(&ism->lock, flags); /* Stop forwarding IRQs and events */ ism->subs[client->id] = NULL; - for (int i = 0; i < ISM_NR_DMBS; ++i) { - if (ism->sba_client_arr[i] == client->id) { - WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", - __func__, client->name); - rc = -EBUSY; - goto err_reg_dmb; - } - } spin_unlock_irqrestore(&ism->lock, flags); } mutex_unlock(&ism_dev_list.mutex); @@ -116,11 +108,6 @@ int ism_unregister_client(struct ism_client *client) max_client--; mutex_unlock(&clients_lock); return rc; - -err_reg_dmb: - spin_unlock_irqrestore(&ism->lock, flags); - mutex_unlock(&ism_dev_list.mutex); - return rc; } EXPORT_SYMBOL_GPL(ism_unregister_client); @@ -308,15 +295,20 @@ static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return ism_cmd(ism, &cmd); } -static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_max_dmbs(void) +{ + return ISM_NR_DMBS; +} + +static void ism_free_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, DMA_FROM_DEVICE); folio_put(virt_to_folio(dmb->cpu_addr)); } -static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_alloc_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { struct folio *folio; unsigned long bit; @@ -325,16 +317,16 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; - if (!dmb->sba_idx) { + if (!dmb->idx) { bit = find_next_zero_bit(ism->sba_bitmap, ISM_NR_DMBS, ISM_DMB_BIT_OFFSET); if (bit == ISM_NR_DMBS) return -ENOSPC; - dmb->sba_idx = bit; + dmb->idx = bit; } - if (dmb->sba_idx < ISM_DMB_BIT_OFFSET || - test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) + if (dmb->idx < ISM_DMB_BIT_OFFSET || + test_and_set_bit(dmb->idx, ism->sba_bitmap)) return -EINVAL; folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | @@ -359,13 +351,14 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out_free: kfree(dmb->cpu_addr); out_bit: - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); return rc; } -int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, - struct ism_client *client) +static int ism_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) { + struct ism_dev *ism = dibs->drv_priv; union ism_reg_dmb cmd; unsigned long flags; int ret; @@ -380,10 +373,10 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, cmd.request.dmb = dmb->dma_addr; cmd.request.dmb_len = dmb->dmb_len; - cmd.request.sba_idx = dmb->sba_idx; + 
cmd.request.sba_idx = dmb->idx; cmd.request.vlan_valid = dmb->vlan_valid; cmd.request.vlan_id = dmb->vlan_id; - cmd.request.rgid = dmb->rgid; + memcpy(&cmd.request.rgid, &dmb->rgid, sizeof(u64)); ret = ism_cmd(ism, &cmd); if (ret) { @@ -391,16 +384,16 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, goto out; } dmb->dmb_tok = cmd.response.dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); out: return ret; } -EXPORT_SYMBOL_GPL(ism_register_dmb); -int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) { + struct ism_dev *ism = dibs->drv_priv; union ism_unreg_dmb cmd; unsigned long flags; int ret; @@ -411,9 +404,9 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) cmd.request.dmb_tok = dmb->dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); ret = ism_cmd(ism, &cmd); if (ret && ret != ISM_ERROR) @@ -423,7 +416,6 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out: return ret; } -EXPORT_SYMBOL_GPL(ism_unregister_dmb); static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { @@ -459,9 +451,11 @@ static unsigned int max_bytes(unsigned int start, unsigned int len, return min(boundary - (start & (boundary - 1)), len); } -int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size) +static int ism_move(struct dibs_dev *dibs, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size) { + struct ism_dev *ism = dibs->drv_priv; unsigned int bytes; u64 dmb_req; int ret; @@ -482,7 +476,6 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, return 0; } -EXPORT_SYMBOL_GPL(ism_move); static u16 ism_get_chid(struct dibs_dev *dibs) { @@ -518,14 +511,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) { struct ism_dev *ism = data; unsigned long bit, end; + struct dibs_dev *dibs; unsigned long *bv; u16 dmbemask; u8 client_id; + dibs = ism->dibs; + bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET]; end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET; - spin_lock(&ism->lock); + spin_lock(&dibs->lock); ism->sba->s = 0; barrier(); for (bit = 0;;) { @@ -537,10 +533,13 @@ static irqreturn_t ism_handle_irq(int irq, void *data) dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - client_id = ism->sba_client_arr[bit]; - if (unlikely(client_id == NO_CLIENT || !ism->subs[client_id])) + client_id = dibs->dmb_clientid_arr[bit]; + if (unlikely(client_id == NO_DIBS_CLIENT || + !dibs->subs[client_id])) continue; - ism->subs[client_id]->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask); + dibs->subs[client_id]->ops->handle_irq(dibs, + bit + ISM_DMB_BIT_OFFSET, + dmbemask); } if (ism->sba->e) { @@ -548,13 +547,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) barrier(); ism_handle_event(ism); } - 
spin_unlock(&ism->lock); + spin_unlock(&dibs->lock); return IRQ_HANDLED; } static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, .query_remote_gid = ism_query_rgid, + .max_dmbs = ism_max_dmbs, + .register_dmb = ism_register_dmb, + .unregister_dmb = ism_unregister_dmb, + .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -568,15 +571,10 @@ static int ism_dev_init(struct ism_dev *ism) if (ret <= 0) goto out; - ism->sba_client_arr = kzalloc(ISM_NR_DMBS, GFP_KERNEL); - if (!ism->sba_client_arr) - goto free_vectors; - memset(ism->sba_client_arr, NO_CLIENT, ISM_NR_DMBS); - ret = request_irq(pci_irq_vector(pdev, 0), ism_handle_irq, 0, pci_name(pdev), ism); if (ret) - goto free_client_arr; + goto free_vectors; ret = register_sba(ism); if (ret) @@ -605,8 +603,6 @@ unreg_sba: unregister_sba(ism); free_irq: free_irq(pci_irq_vector(pdev, 0), ism); -free_client_arr: - kfree(ism->sba_client_arr); free_vectors: pci_free_irq_vectors(pdev); out: @@ -629,7 +625,6 @@ static void ism_dev_exit(struct ism_dev *ism) unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); - kfree(ism->sba_client_arr); pci_free_irq_vectors(pdev); list_del_init(&ism->list); mutex_unlock(&ism_dev_list.mutex); @@ -677,6 +672,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dibs->drv_priv = ism; dibs->ops = &ism_ops; + /* enable ism device, but any interrupts and events will be ignored + * before dibs_dev_add() adds it to any clients. + */ ret = ism_dev_init(ism); if (ret) goto err_dibs; @@ -766,17 +764,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client) -{ - return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client); -} - -static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -801,18 +788,8 @@ static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, trigger_irq, event_code, info); } -static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size) -{ - return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); -} - static const struct smcd_ops ism_smcd_ops = { - .register_dmb = smcd_register_dmb, - .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, - .move_data = smcd_move, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c75a40fe3039..be009c614205 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -36,12 +36,44 @@ * clients. */ +/* DMB - Direct Memory Buffer + * -------------------------- + * A dibs client provides a dmb as input buffer for a local receiving + * dibs device for exactly one (remote) sending dibs device. Only this + * sending device can send data into this dmb using move_data(). Sender + * and receiver can be the same device. A dmb belongs to exactly one client. + */ +struct dibs_dmb { + /* tok - Token for this dmb + * Used by remote and local devices and clients to address this dmb. + * Provided by dibs fabric. Unique per dibs fabric. 
+ */ + u64 dmb_tok; + /* rgid - GID of designated remote sending device */ + uuid_t rgid; + /* cpu_addr - buffer address */ + void *cpu_addr; + /* len - buffer length */ + u32 dmb_len; + /* idx - Index of this DMB on this receiving device */ + u32 idx; + /* VLAN support (deprecated) + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to this vlan + */ + u32 vlan_valid; + u32 vlan_id; + /* optional, used by device driver */ + dma_addr_t dma_addr; +}; + struct dibs_dev; /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +#define NO_DIBS_CLIENT 0xff /* All dibs clients have access to all dibs devices. * A dibs client provides the following functions to be called by dibs layer or * dibs device drivers: @@ -69,6 +101,22 @@ struct dibs_client_ops { * The device is no longer usable by this client after this call. */ void (*del_dev)(struct dibs_dev *dev); + /** + * handle_irq() - Handle signaling for a DMB + * @dev: device that owns the dmb + * @idx: Index of the dmb that got signalled + * @dmbemask: signaling mask of the dmb + * + * Handle signaling for a dmb that was registered by this client + * for this device. + * The dibs device can coalesce multiple signaling triggers into a + * single call of handle_irq(). dmbemask can be used to indicate + * different kinds of triggers. + * + * Context: Called in IRQ context by dibs device driver + */ + void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, + u16 dmbemask); }; struct dibs_client { @@ -147,6 +195,77 @@ struct dibs_dev_ops { */ int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, u32 vid_valid, u32 vid); + /** + * max_dmbs() + * Return: Max number of DMBs that can be registered for this kind of + * dibs_dev + */ + int (*max_dmbs)(void); + /** + * register_dmb() - allocate and register a dmb + * @dev: dibs device + * @dmb: dmb struct to be registered + * @client: dibs client + * + * The following fields of dmb must provide valid input: + * @rgid: gid of remote user device + * @dmb_len: buffer length + * @idx: Optionally: requested idx (if non-zero) + * @vlan_valid: if zero, vlan_id will be ignored; + * deprecated, ignored if device does not support vlan + * @vlan_id: deprecated, ignored if device does not support vlan + * Upon return in addition the following fields will be valid: + * @dmb_tok: for usage by remote and local devices and clients + * @cpu_addr: allocated buffer + * @idx: dmb index, unique per dibs device + * @dma_addr: to be used by device driver, if applicable + * + * Allocate a dmb buffer and register it with this device and for this + * client. + * Return: zero on success + */ + int (*register_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb, + struct dibs_client *client); + /** + * unregister_dmb() - unregister and free a dmb + * @dev: dibs device + * @dmb: dmb struct to be unregistered + * The following fields of dmb must provide valid input: + * @dmb_tok + * @cpu_addr + * @idx + * + * Free dmb.cpu_addr and unregister the dmb from this device. + * Return: zero on success + */ + int (*unregister_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * move_data() - write into a remote dmb + * @dev: Local sending dibs device + * @dmb_tok: Token of the remote dmb + * @idx: signaling index in dmbemask + * @sf: signaling flag; + * if true, idx will be turned on in the target dmbemask + * and the target device will be signaled.
+ * @offset: offset within target dmb + * @data: pointer to data to be sent + * @size: length of data to be sent, can be zero. + * + * Use dev to write data of size at offset into a remote dmb + * identified by dmb_tok. Data is moved synchronously, *data can + * be freed when this function returns. + * + * If signaling flag (sf) is true, bit number idx will be turned + * on in the dmbemask when handle_irq() is called at the remote + * dibs client that owns the target dmb. The target device may choose + * to coalesce the signaling triggers of multiple move_data() calls + * to the same target dmb into a single handle_irq() call. + * Return: zero on success + */ + int (*move_data)(struct dibs_dev *dev, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device @@ -166,6 +285,55 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * support_mmapped_rdmb() - can this device provide memory mapped + * remote dmbs? (optional) + * @dev: dibs device + * + * A dibs device can provide a kernel address + length that represent + * a remote target dmb (like MMIO). As an alternative to calling + * move_data(), a dibs client can write into such a ghost-send-buffer + * (= to this kernel address) and the data will immediately + * appear in the target dmb, even without calling + * move_data(). + * + * Either all 3 function pointers for support_mmapped_rdmb(), + * attach_dmb() and detach_dmb() are defined, or all of them must + * be NULL. + * + * Return: non-zero if memory mapped remote dmbs are supported. + */ + int (*support_mmapped_rdmb)(struct dibs_dev *dev); + /** + * attach_dmb() - attach local memory to a remote dmb + * @dev: Local sending dibs device + * @dmb: all other parameters are passed in the form of a + * dmb struct + * TODO: (THIS IS CONFUSING, should be changed) + * dmb_tok: (in) Token of the remote dmb we want to attach to + * cpu_addr: (out) MMIO address + * dma_addr: (out) MMIO address (if applicable, invalid otherwise) + * dmb_len: (out) length of local MMIO region, + * equal to length of remote DMB. + * sba_idx: (out) index of remote dmb (NOT HELPFUL, should be removed) + * + * Provides a memory address to the sender that can be used to + * directly write into the remote dmb. + * Memory is available until detach_dmb() is called. + * + * Return: Zero upon success, Error code otherwise + */ + int (*attach_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * detach_dmb() - Detach the ghost buffer from a remote dmb + * @dev: dibs device + * @token: dmb token of the remote dmb + * + * No need to free cpu_addr.
+ * + * Return: Zero upon success, Error code otherwise + */ + int (*detach_dmb)(struct dibs_dev *dev, u64 token); }; struct dibs_dev { @@ -179,6 +347,15 @@ struct dibs_dev { /* priv pointer per client; for client usage only */ void *priv[MAX_DIBS_CLIENTS]; + + /* get this lock before accessing any of the fields below */ + spinlock_t lock; + /* array of client ids indexed by dmb idx; + * can be used as indices into priv and subs arrays + */ + u8 *dmb_clientid_arr; + /* Sparse array of all ISM clients */ + struct dibs_client *subs[MAX_DIBS_CLIENTS]; }; static inline void dibs_set_priv(struct dibs_dev *dev, diff --git a/include/linux/ism.h b/include/linux/ism.h index a926dd61b5a1..b7feb4dcd5a8 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -11,17 +11,6 @@ #include -struct ism_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - /* Unless we gain unexpected popularity, this limit should hold for a while */ #define MAX_CLIENTS 8 #define ISM_NR_DMBS 1920 @@ -36,7 +25,6 @@ struct ism_dev { struct ism_sba *sba; dma_addr_t sba_dma_addr; DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); - u8 *sba_client_arr; /* entries are indices into 'clients' array */ void *priv[MAX_CLIENTS]; struct ism_eq *ieq; @@ -58,11 +46,6 @@ struct ism_event { struct ism_client { const char *name; void (*handle_event)(struct ism_dev *dev, struct ism_event *event); - /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent - * via ism_move_data(). Callback function must handle all active bits - * indicated by dmbemask. - */ - void (*handle_irq)(struct ism_dev *dev, unsigned int bit, u16 dmbemask); /* Private area - don't touch! */ u8 id; }; @@ -79,12 +62,6 @@ static inline void ism_set_priv(struct ism_dev *dev, struct ism_client *client, dev->priv[client->id] = priv; } -int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, - struct ism_client *client); -int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); -int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size); - const struct smcd_ops *ism_get_smcd_ops(void); #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index 5bd135fb4d49..8e3debcf7db5 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -28,17 +28,6 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -struct smcd_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - #define ISM_EVENT_DMB 0 #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 @@ -53,25 +42,14 @@ struct smcd_gid { }; struct smcd_ops { - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - void *client); - int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size); - /* optional operations */ int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); - int (*support_dmb_nocopy)(struct smcd_dev *dev); - int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*detach_dmb)(struct smcd_dev *dev, u64 token); }; struct smcd_dev { const struct smcd_ops *ops; void *priv; - void *client; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Makefile b/net/smc/Makefile index 
96ccfdf246df..0e754cbc38f9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,3 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-y += smc_loopback.o diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index d20d00b46825..01e49371d23d 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,7 +15,6 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" -#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -33,18 +32,19 @@ static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, - u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .handle_event = smcd_handle_event, - .handle_irq = smcd_handle_irq, }; #endif +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, + u16 dmbemask); + static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_irq = smcd_handle_irq, }; static struct dibs_client smc_dibs_client = { @@ -221,18 +221,19 @@ out: void smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - smcd->ops->unregister_dmb(smcd, &dmb); + + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); return; } @@ -240,17 +241,20 @@ void smc_ism_unregister_dmb(struct smcd_dev *smcd, int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -265,24 +269,24 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) * merging sndbuf with peer DMB to avoid * data copies between them. 
*/ - return (smcd->ops->support_dmb_nocopy && - smcd->ops->support_dmb_nocopy(smcd)); + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; int rc = 0; - if (!dev->ops->attach_dmb) + if (!dev->dibs->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; - rc = dev->ops->attach_dmb(dev, &dmb); + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -294,10 +298,10 @@ int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { - if (!dev->ops->detach_dmb) + if (!dev->dibs->ops->detach_dmb) return -EINVAL; - return dev->ops->detach_dmb(dev, token); + return dev->dibs->ops->detach_dmb(dev->dibs, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -503,26 +507,20 @@ static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; const struct smcd_ops *ops; - struct smc_lo_dev *smc_lo; struct ism_dev *ism; + int max_dmbs; - if (smc_ism_is_loopback(dibs)) { - if (smc_loopback_init(&smc_lo)) - return; - } + max_dmbs = dibs->ops->max_dmbs(); if (smc_ism_is_loopback(dibs)) { - ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - SMC_LO_MAX_DMBS); + ops = NULL; } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - ISM_NR_DMBS); } + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); if (!smcd) return; @@ -530,13 +528,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) dibs_set_priv(dibs, &smc_dibs_client, smcd); if (smc_ism_is_loopback(dibs)) { - smcd->priv = smc_lo; - smc_lo->smcd = smcd; + smcd->priv = NULL; } else { smcd->priv = ism; #if IS_ENABLED(CONFIG_ISM) ism_set_priv(ism, &smc_ism_client, smcd); - smcd->client = &smc_ism_client; #endif } @@ -590,8 +586,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); - if (smc_ism_is_loopback(dibs)) - smc_loopback_exit(); kfree(smcd->conn); kfree(smcd); } @@ -624,6 +618,7 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } +#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. @@ -632,10 +627,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) * Context: * - Function called in IRQ context from ISM device driver IRQ handler. 
*/ -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -645,7 +640,6 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -#endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 139e99da2c9f..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -69,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c deleted file mode 100644 index 52cba01cb209..000000000000 --- a/net/smc/smc_loopback.c +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * Functions for loopback-ism device. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#include -#include -#include -#include - -#include "smc_cdc.h" -#include "smc_ism.h" -#include "smc_loopback.h" - -#define SMC_LO_SUPPORT_NOCOPY 0x1 -#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) - -static struct smc_lo_dev *lo_dev; - -static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client_priv) -{ - struct smc_lo_dmb_node *dmb_node, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - int sba_idx, rc; - - /* check space for new dmb */ - for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { - if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) - break; - } - if (sba_idx == SMC_LO_MAX_DMBS) - return -ENOSPC; - - dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); - if (!dmb_node) { - rc = -ENOMEM; - goto err_bit; - } - - dmb_node->sba_idx = sba_idx; - dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { - rc = -ENOMEM; - goto err_node; - } - dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; - refcount_set(&dmb_node->refcnt, 1); - -again: - /* add new dmb into hash table */ - get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); - write_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { - if (tmp_node->token == dmb_node->token) { - write_unlock_bh(&ldev->dmb_ht_lock); - goto again; - } - } - hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); - write_unlock_bh(&ldev->dmb_ht_lock); - atomic_inc(&ldev->dmb_cnt); - - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - - return 0; - -err_node: - kfree(dmb_node); -err_bit: - clear_bit(sba_idx, ldev->sba_idx_mask); - return rc; -} - -static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, - struct smc_lo_dmb_node *dmb_node) -{ - /* remove dmb from hash table */ - write_lock_bh(&ldev->dmb_ht_lock); - hash_del(&dmb_node->list); - write_unlock_bh(&ldev->dmb_ht_lock); - - clear_bit(dmb_node->sba_idx, 
ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); - kfree(dmb_node); - - if (atomic_dec_and_test(&ldev->dmb_cnt)) - wake_up(&ldev->ldev_release); -} - -static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb from hash table */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) -{ - return SMC_LO_SUPPORT_NOCOPY; -} - -static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!refcount_inc_not_zero(&dmb_node->refcnt)) - /* the dmb is being unregistered, but has - * not been removed from the hash table. - */ - return -EINVAL; - - /* provide dmb information */ - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - return 0; -} - -static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { - if (tmp_node->token == token) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, - unsigned int idx, bool sf, unsigned int offset, - void *data, unsigned int size) -{ - struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct smc_connection *conn; - - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { - if (tmp_node->token == dmb_tok) { - rmb_node = tmp_node; - break; - } - } - if (!rmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - memcpy((char *)rmb_node->cpu_addr + offset, data, size); - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!sf) - return 0; - - conn = smcd->conn[rmb_node->sba_idx]; - if (!conn || conn->killed) - return -EPIPE; - tasklet_schedule(&conn->rx_tsklet); - return 0; -} - -static const struct smcd_ops lo_ops = { - .register_dmb = smc_lo_register_dmb, - .unregister_dmb = smc_lo_unregister_dmb, - .support_dmb_nocopy = smc_lo_support_dmb_nocopy, - .attach_dmb = smc_lo_attach_dmb, - .detach_dmb = smc_lo_detach_dmb, - .signal_event = NULL, - .move_data = smc_lo_move_data, -}; - -const struct smcd_ops 
*smc_lo_get_smcd_ops(void) -{ - return &lo_ops; -} - -static void smc_lo_dev_init(struct smc_lo_dev *ldev) -{ - rwlock_init(&ldev->dmb_ht_lock); - hash_init(ldev->dmb_ht); - atomic_set(&ldev->dmb_cnt, 0); - init_waitqueue_head(&ldev->ldev_release); - - return; -} - -static void smc_lo_dev_exit(struct smc_lo_dev *ldev) -{ - if (atomic_read(&ldev->dmb_cnt)) - wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); -} - -static int smc_lo_dev_probe(void) -{ - struct smc_lo_dev *ldev; - - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) - return -ENOMEM; - - smc_lo_dev_init(ldev); - - lo_dev = ldev; /* global loopback device */ - - return 0; -} - -static void smc_lo_dev_remove(void) -{ - if (!lo_dev) - return; - - smc_lo_dev_exit(lo_dev); - kfree(lo_dev); - lo_dev = NULL; -} - -int smc_loopback_init(struct smc_lo_dev **smc_lb) -{ - int ret; - - ret = smc_lo_dev_probe(); - if (!ret) - *smc_lb = lo_dev; - return ret; -} - -void smc_loopback_exit(void) -{ - smc_lo_dev_remove(); -} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h deleted file mode 100644 index 33bb96ec8b77..000000000000 --- a/net/smc/smc_loopback.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * SMC-D loopback-ism device structure definitions. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#ifndef _SMC_LOOPBACK_H -#define _SMC_LOOPBACK_H - -#include -#include - -#define SMC_LO_MAX_DMBS 5000 -#define SMC_LO_DMBS_HASH_BITS 12 - -struct smc_lo_dmb_node { - struct hlist_node list; - u64 token; - u32 len; - u32 sba_idx; - void *cpu_addr; - dma_addr_t dma_addr; - refcount_t refcnt; -}; - -struct smc_lo_dev { - struct smcd_dev *smcd; - atomic_t dmb_cnt; - rwlock_t dmb_ht_lock; - DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); - DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); - wait_queue_head_t ldev_release; -}; - -const struct smcd_ops *smc_lo_get_smcd_ops(void); - -int smc_loopback_init(struct smc_lo_dev **smc_lb); -void smc_loopback_exit(void); - -#endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From a612dbe8d04d47af91fa88f0599c1370cc70f687 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:05:00 +0200 Subject: dibs: Move event handling to dibs layer Add defines for all event types and subtypes an ism device is known to produce as it can be helpful for debugging purposes. Introduces a generic 'struct dibs_event' and adopt ism device driver and smc-d client accordingly. Tolerate and ignore other type and subtype values to enable future device extensions. SMC-D and ISM are now independent. struct ism_dev can be moved to drivers/s390/net/ism.h. Note that in smc, the term 'ism' is still used. Future patches could replace that with 'dibs' or 'smc-d' as appropriate. 
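For illustration only, not part of this patch: under the new interface a dibs client consumes these generic events through its handle_event() callback. A minimal sketch follows; the function name and the pr_debug() calls are invented for this example, while struct dibs_event and the DIBS_* constants are the ones introduced here:

static void example_handle_event(struct dibs_dev *dev,
				 const struct dibs_event *event)
{
	switch (event->type) {
	case DIBS_BUF_EVENT:
		/* buffer events carry the token of the affected dmb */
		pr_debug("buf event, subtype %u, tok 0x%llx\n",
			 event->subtype, event->buffer_tok);
		break;
	case DIBS_DEV_EVENT:
		/* device events identify the remote device by its gid */
		pr_debug("dev event, subtype %u, gid %pUb\n",
			 event->subtype, &event->gid);
		break;
	case DIBS_SW_EVENT:
		/* subtype and data are defined by the sending client */
		pr_debug("sw event, code %u, data 0x%llx\n",
			 event->subtype, event->data);
		break;
	default:
		/* tolerate unknown types to allow future extensions */
		break;
	}
}
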
Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-15-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 2 - arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 2 +- drivers/s390/net/Kconfig | 1 - drivers/s390/net/ism.h | 42 +++++++- drivers/s390/net/ism_drv.c | 221 ++++++++++++-------------------------- include/linux/dibs.h | 62 +++++++++++ include/net/smc.h | 15 --- net/smc/Kconfig | 1 - net/smc/smc_ism.c | 99 ++++++----------- 11 files changed, 208 insertions(+), 239 deletions(-) (limited to 'net') diff --git a/MAINTAINERS b/MAINTAINERS index ecc55fae5f9d..1fbe541198f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17580,7 +17580,6 @@ F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h -F: include/linux/ism.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h @@ -22237,7 +22236,6 @@ L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported F: drivers/s390/net/ -F: include/linux/ism.h S390 PCI SUBSYSTEM M: Niklas Schnelle diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index a97c8d19f643..fdde8ee0d7bd 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -122,6 +122,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 7f7b52d9a33c..bf9e7dbd4a89 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -113,6 +113,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index aacb3ea7825a..5425238d5a42 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -98,7 +98,7 @@ int dibs_unregister_client(struct dibs_client *client) goto err_reg_dmb; } } - /* Stop forwarding IRQs */ + /* Stop forwarding IRQs and events */ dibs->subs[client->id] = NULL; spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 92985f595d59..0fd700c5745a 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -82,7 +82,6 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" depends on PCI && DIBS - imply SMC default n help Select this option if you want to use the Internal Shared Memory diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 1b9fa14da20c..08d17956cb36 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -6,13 +6,13 @@ #include #include #include -#include -#include #include #define UTIL_STR_LEN 16 #define ISM_ERROR 0xFFFF +#define ISM_NR_DMBS 1920 + /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. 
*/ @@ -34,6 +34,23 @@ #define ISM_UNREG_SBA 0x11 #define ISM_UNREG_IEQ 0x12 +enum ism_event_type { + ISM_EVENT_BUF = 0x00, + ISM_EVENT_DEV = 0x01, + ISM_EVENT_SWR = 0x02 +}; + +enum ism_event_code { + ISM_BUF_DMB_UNREGISTERED = 0x04, + ISM_BUF_USING_ISM_DEV_DISABLED = 0x08, + ISM_BUF_OWNING_ISM_DEV_IN_ERR_STATE = 0x02, + ISM_BUF_USING_ISM_DEV_IN_ERR_STATE = 0x03, + ISM_BUF_VLAN_MISMATCH_WITH_OWNER = 0x05, + ISM_BUF_VLAN_MISMATCH_WITH_USER = 0x06, + ISM_DEV_GID_DISABLED = 0x07, + ISM_DEV_GID_ERR_STATE = 0x01 +}; + struct ism_req_hdr { u32 cmd; u16 : 16; @@ -185,6 +202,14 @@ struct ism_eq_header { u64 : 64; }; +struct ism_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + struct ism_eq { struct ism_eq_header header; struct ism_event entry[15]; @@ -199,6 +224,19 @@ struct ism_sba { u16 dmbe_mask[ISM_NR_DMBS]; }; +struct ism_dev { + spinlock_t cmd_lock; /* serializes cmds */ + struct dibs_dev *dibs; + struct pci_dev *pdev; + struct ism_sba *sba; + dma_addr_t sba_dma_addr; + DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); + + struct ism_eq *ieq; + dma_addr_t ieq_dma_addr; + int ieq_idx; +}; + #define ISM_CREATE_REQ(dmb, idx, sf, offset) \ ((dmb) | (idx) << 24 | (sf) << 23 | (offset)) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 346d1ea8650b..f84aa2e676e9 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -31,86 +31,6 @@ MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; -#define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ -static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ - /* a list for fast mapping */ -static u8 max_client; -static DEFINE_MUTEX(clients_lock); -struct ism_dev_list { - struct list_head list; - struct mutex mutex; /* protects ism device list */ -}; - -static struct ism_dev_list ism_dev_list = { - .list = LIST_HEAD_INIT(ism_dev_list.list), - .mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex), -}; - -static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism) -{ - unsigned long flags; - - spin_lock_irqsave(&ism->lock, flags); - ism->subs[client->id] = client; - spin_unlock_irqrestore(&ism->lock, flags); -} - -int ism_register_client(struct ism_client *client) -{ - struct ism_dev *ism; - int i, rc = -ENOSPC; - - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < MAX_CLIENTS; ++i) { - if (!clients[i]) { - clients[i] = client; - client->id = i; - if (i == max_client) - max_client++; - rc = 0; - break; - } - } - mutex_unlock(&clients_lock); - - if (i < MAX_CLIENTS) { - /* initialize with all devices that we got so far */ - list_for_each_entry(ism, &ism_dev_list.list, list) { - ism->priv[i] = NULL; - ism_setup_forwarding(client, ism); - } - } - mutex_unlock(&ism_dev_list.mutex); - - return rc; -} -EXPORT_SYMBOL_GPL(ism_register_client); - -int ism_unregister_client(struct ism_client *client) -{ - struct ism_dev *ism; - unsigned long flags; - int rc = 0; - - mutex_lock(&ism_dev_list.mutex); - list_for_each_entry(ism, &ism_dev_list.list, list) { - spin_lock_irqsave(&ism->lock, flags); - /* Stop forwarding IRQs and events */ - ism->subs[client->id] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - } - mutex_unlock(&ism_dev_list.mutex); - - mutex_lock(&clients_lock); - clients[client->id] = NULL; - if (client->id + 1 == max_client) - max_client--; - mutex_unlock(&clients_lock); - return rc; -} -EXPORT_SYMBOL_GPL(ism_unregister_client); - static int ism_cmd(struct ism_dev *ism, void *cmd) { struct ism_req_hdr 
*req = cmd; @@ -445,6 +365,24 @@ static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) return ism_cmd(ism, &cmd); } +static int ism_signal_ieq(struct dibs_dev *dibs, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_sig_ieq cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.trigger_irq = trigger_irq; + cmd.request.event_code = event_code; + cmd.request.info = info; + + return ism_cmd(ism, &cmd); +} + static unsigned int max_bytes(unsigned int start, unsigned int len, unsigned int boundary) { @@ -487,22 +425,68 @@ static u16 ism_get_chid(struct dibs_dev *dibs) return to_zpci(ism->pdev)->pchid; } +static int ism_match_event_type(u32 s390_event_type) +{ + switch (s390_event_type) { + case ISM_EVENT_BUF: + return DIBS_BUF_EVENT; + case ISM_EVENT_DEV: + return DIBS_DEV_EVENT; + case ISM_EVENT_SWR: + return DIBS_SW_EVENT; + default: + return DIBS_OTHER_TYPE; + } +} + +static int ism_match_event_subtype(u32 s390_event_subtype) +{ + switch (s390_event_subtype) { + case ISM_BUF_DMB_UNREGISTERED: + return DIBS_BUF_UNREGISTERED; + case ISM_DEV_GID_DISABLED: + return DIBS_DEV_DISABLED; + case ISM_DEV_GID_ERR_STATE: + return DIBS_DEV_ERR_STATE; + default: + return DIBS_OTHER_SUBTYPE; + } +} + static void ism_handle_event(struct ism_dev *ism) { + struct dibs_dev *dibs = ism->dibs; + struct dibs_event event; struct ism_event *entry; - struct ism_client *clt; + struct dibs_client *clt; int i; while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) { - if (++(ism->ieq_idx) == ARRAY_SIZE(ism->ieq->entry)) + if (++ism->ieq_idx == ARRAY_SIZE(ism->ieq->entry)) ism->ieq_idx = 0; entry = &ism->ieq->entry[ism->ieq_idx]; debug_event(ism_debug_info, 2, entry, sizeof(*entry)); - for (i = 0; i < max_client; ++i) { - clt = ism->subs[i]; + __memset(&event, 0, sizeof(event)); + event.type = ism_match_event_type(entry->type); + if (event.type == DIBS_SW_EVENT) + event.subtype = entry->code; + else + event.subtype = ism_match_event_subtype(entry->code); + event.time = entry->time; + event.data = entry->info; + switch (event.type) { + case DIBS_BUF_EVENT: + event.buffer_tok = entry->tok; + break; + case DIBS_DEV_EVENT: + case DIBS_SW_EVENT: + memcpy(&event.gid, &entry->tok, sizeof(u64)); + } + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + clt = dibs->subs[i]; if (clt) - clt->handle_event(ism, entry); + clt->ops->handle_event(dibs, &event); } } } @@ -560,12 +544,13 @@ static const struct dibs_dev_ops ism_ops = { .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, + .signal_event = ism_signal_ieq, }; static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - int i, ret; + int ret; ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); if (ret <= 0) @@ -584,18 +569,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) { - ism_setup_forwarding(clients[i], ism); - } - } - mutex_unlock(&clients_lock); - - list_add(&ism->list, &ism_dev_list.list); - mutex_unlock(&ism_dev_list.mutex); - query_info(ism); return 0; @@ -612,22 +585,11 @@ out: static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - unsigned long flags; - int i; - - spin_lock_irqsave(&ism->lock, flags); - for (i = 0; i 
< max_client; ++i) - ism->subs[i] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - - mutex_lock(&ism_dev_list.mutex); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); pci_free_irq_vectors(pdev); - list_del_init(&ism->list); - mutex_unlock(&ism_dev_list.mutex); } static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -641,7 +603,6 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!ism) return -ENOMEM; - spin_lock_init(&ism->lock); spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; @@ -742,8 +703,6 @@ static int __init ism_init(void) if (!ism_debug_info) return -ENODEV; - memset(clients, 0, sizeof(clients)); - max_client = 0; debug_register_view(ism_debug_info, &debug_hex_ascii_view); ret = pci_register_driver(&ism_driver); if (ret) @@ -760,41 +719,3 @@ static void __exit ism_exit(void) module_init(ism_init); module_exit(ism_exit); - -/*************************** SMC-D Implementation *****************************/ - -#if IS_ENABLED(CONFIG_SMC) -static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - union ism_sig_ieq cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.trigger_irq = trigger_irq; - cmd.request.event_code = event_code; - cmd.request.info = info; - - return ism_cmd(ism, &cmd); -} - -static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info) -{ - return ism_signal_ieq(smcd->priv, rgid->gid, - trigger_irq, event_code, info); -} - -static const struct smcd_ops ism_smcd_ops = { - .signal_event = smcd_signal_ieq, -}; - -const struct smcd_ops *ism_get_smcd_ops(void) -{ - return &ism_smcd_ops; -} -EXPORT_SYMBOL_GPL(ism_get_smcd_ops); -#endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index be009c614205..c75607f8a5cf 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -67,6 +67,41 @@ struct dibs_dmb { dma_addr_t dma_addr; }; +/* DIBS events + * ----------- + * Dibs devices can optionally notify dibs clients about events that happened + * in the fabric or at the remote device or remote dmb. + */ +enum dibs_event_type { + /* Buffer event, e.g. a remote dmb was unregistered */ + DIBS_BUF_EVENT, + /* Device event, e.g. a remote dibs device was disabled */ + DIBS_DEV_EVENT, + /* Software event, a dibs client can send an event signal to a + * remote dibs device. 
+ */ + DIBS_SW_EVENT, + DIBS_OTHER_TYPE }; + +enum dibs_event_subtype { + DIBS_BUF_UNREGISTERED, + DIBS_DEV_DISABLED, + DIBS_DEV_ERR_STATE, + DIBS_OTHER_SUBTYPE +}; + +struct dibs_event { + u32 type; + u32 subtype; + /* uuid_null if invalid */ + uuid_t gid; + /* zero if invalid */ + u64 buffer_tok; + u64 time; + /* additional data or zero */ + u64 data; +}; + struct dibs_dev; /* DIBS client @@ -117,6 +152,15 @@ struct dibs_client_ops { */ void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, u16 dmbemask); + /** + * handle_event() - Handle control information sent by device + * @dev: device reporting the event + * @event: dibs event structure + * + * Context: Called in IRQ context by dibs device driver + */ + void (*handle_event)(struct dibs_dev *dev, + const struct dibs_event *event); }; struct dibs_client { @@ -285,6 +329,24 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * signal_event() - trigger an event at a remote dibs device (optional) + * @dev: local dibs device + * @rgid: gid of remote dibs device + * @trigger_irq: zero: notification may be coalesced with other events + * non-zero: notify immediately + * @event_code: 4 byte event code, meaning is defined by dibs client + * @info: 8 bytes of additional information, + * meaning is defined by dibs client + * + * dibs devices can offer support for sending a control event of type + * DIBS_SW_EVENT to a remote dibs device. + * NOTE: handle_event() will be called for all registered dibs clients + * at the remote device. + * Return: zero on success + */ + int (*signal_event)(struct dibs_dev *dev, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info); /** * support_mmapped_rdmb() - can this device provide memory mapped * remote dmbs?
(optional) diff --git a/include/net/smc.h b/include/net/smc.h index 8e3debcf7db5..08bee529ed8d 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -16,7 +16,6 @@ #include #include #include -#include "linux/ism.h" struct sock; @@ -28,28 +27,14 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -#define ISM_EVENT_DMB 0 -#define ISM_EVENT_GID 1 -#define ISM_EVENT_SWR 2 - #define ISM_RESERVED_VLANID 0x1FFF -struct smcd_dev; - struct smcd_gid { u64 gid; u64 gid_ext; }; -struct smcd_ops { - /* optional operations */ - int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info); -}; - struct smcd_dev { - const struct smcd_ops *ops; - void *priv; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 9535d88c2acb..99ecd59d1f4b 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,7 +2,6 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND && DIBS - depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 01e49371d23d..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,7 +17,6 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" -#include "linux/ism.h" #include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { @@ -30,20 +29,15 @@ static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); -#if IS_ENABLED(CONFIG_ISM) -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); - -static struct ism_client smc_ism_client = { - .name = "SMC-D", - .handle_event = smcd_handle_event, -}; -#endif +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask); static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; @@ -399,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct ism_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -423,25 +416,27 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - struct smcd_gid peer_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && - wrk->smcd->ops->signal_event) { + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - &peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, 
+ ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } @@ -451,26 +446,24 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); - struct smcd_gid smcd_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -#endif -static struct smcd_dev *smcd_alloc_dev(const char *name, - const struct smcd_ops *ops, - int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; @@ -487,8 +480,6 @@ static struct smcd_dev *smcd_alloc_dev(const char *name, if (!smcd->event_wq) goto free_conn; - smcd->ops = ops; - spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); @@ -506,36 +497,17 @@ free_smcd: static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; - const struct smcd_ops *ops; - struct ism_dev *ism; int max_dmbs; max_dmbs = dibs->ops->max_dmbs(); - if (smc_ism_is_loopback(dibs)) { - ops = NULL; - } else { - ism = dibs->drv_priv; -#if IS_ENABLED(CONFIG_ISM) - ops = ism_get_smcd_ops(); -#endif - } - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); if (!smcd) return; smcd->dibs = dibs; dibs_set_priv(dibs, &smc_dibs_client, smcd); - if (smc_ism_is_loopback(dibs)) { - smcd->priv = NULL; - } else { - smcd->priv = ism; -#if IS_ENABLED(CONFIG_ISM) - ism_set_priv(ism, &smc_ism_client, smcd); -#endif - } - if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); @@ -590,7 +562,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) kfree(smcd); } -#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), @@ -602,9 +573,10 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) * Context: * - Function called in IRQ context from ISM device driver event handler. */ -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -618,7 +590,6 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. 
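An illustrative sketch, not part of this patch: on the driver side, delivering an event now amounts to translating the hardware-specific entry into a struct dibs_event and fanning it out to all subscribed clients, as ism_handle_event() does above. A minimal form, with example_to_dibs_event() as a hypothetical stand-in for the driver-specific translation:

static void example_deliver_event(struct dibs_dev *dibs, void *hw_entry)
{
	struct dibs_event event;
	int i;

	/* translate the driver-specific event format into generic form */
	example_to_dibs_event(hw_entry, &event);

	/* caller holds dibs->lock, so subs[] cannot change underneath us */
	for (i = 0; i < MAX_DIBS_CLIENTS; ++i)
		if (dibs->subs[i])
			dibs->subs[i]->ops->handle_event(dibs, &event);
}
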
@@ -644,22 +615,22 @@ static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; + uuid_t ism_rgid; if (lgr->peer_shutdown) return 0; - if (!lgr->smcd->ops->signal_event) + if (!lgr->smcd->dibs->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); -#endif return rc; } @@ -670,9 +641,6 @@ int smc_ism_init(void) smc_ism_v2_capable = false; smc_ism_create_system_eid(); -#if IS_ENABLED(CONFIG_ISM) - rc = ism_register_client(&smc_ism_client); -#endif rc = dibs_register_client(&smc_dibs_client); return rc; } @@ -680,7 +648,4 @@ int smc_ism_init(void) void smc_ism_exit(void) { dibs_unregister_client(&smc_dibs_client); -#if IS_ENABLED(CONFIG_ISM) - ism_unregister_client(&smc_ism_client); -#endif } -- cgit v1.2.3 From 3d18f80ce181ba27f37d0ec1c550b22acb01dd49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:52 +1000 Subject: VFS: rename kern_path_locked() and related functions. kern_path_locked() is now only used to prepare for removing an object from the filesystem (and that is the only credible reason for wanting a positive locked dentry). Thus it corresponds to kern_path_create() and so should have a corresponding name. Unfortunately the name "kern_path_create" is somewhat misleading as it doesn't actually create anything. The recently added simple_start_creating() provides a better pattern I believe. The "start" can be matched with "end" to bracket the creating or removing. So this patch changes names: kern_path_locked -> start_removing_path kern_path_create -> start_creating_path user_path_create -> start_creating_user_path user_path_locked_at -> start_removing_user_path_at done_path_create -> end_creating_path and also introduces end_removing_path() which is identical to end_creating_path(). __start_removing_path (which was __kern_path_locked) is enhanced to call mnt_want_write() for consistency with the start_creating_path(). Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 12 ++++++ arch/powerpc/platforms/cell/spufs/syscalls.c | 4 +- drivers/base/devtmpfs.c | 22 +++++------ fs/bcachefs/fs-ioctl.c | 10 ++--- fs/init.c | 17 ++++---- fs/namei.c | 59 +++++++++++++++++----------- fs/ocfs2/refcounttree.c | 4 +- fs/smb/server/vfs.c | 8 ++-- include/linux/namei.h | 14 ++++--- kernel/bpf/inode.c | 4 +- net/unix/af_unix.c | 6 +-- 11 files changed, 93 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..e0494860be6b 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. 
+ +--- + +**mandatory** + +Several functions are renamed: + +- kern_path_locked -> start_removing_path +- kern_path_create -> start_creating_path +- user_path_create -> start_creating_user_path +- user_path_locked_at -> start_removing_user_path_at +- done_path_create -> end_creating_path diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 157e046e6e93..ea4ba1b6ce6a 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, struct dentry *dentry; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 31bfb3194b4c..9d4e46ad8352 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode) struct dentry *dentry; struct path path; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } @@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, struct path path; int err; - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -256,7 +256,7 @@ static int dev_rmdir(const char *name) struct dentry *dentry; int err; - dentry = kern_path_locked(name, &parent); + dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) @@ -265,9 +265,7 @@ static int dev_rmdir(const char *name) else err = -EPERM; - dput(dentry); - inode_unlock(d_inode(parent.dentry)); - path_put(&parent); + end_removing_path(&parent, dentry); return err; } @@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev) int deleted = 0; int err = 0; - dentry = kern_path_locked(nodename, &parent); + dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (!err || err == -ENOENT) deleted = 1; } - dput(dentry); - inode_unlock(d_inode(parent.dentry)); + end_removing_path(&parent, dentry); - path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git a/fs/bcachefs/fs-ioctl.c 
b/fs/bcachefs/fs-ioctl.c index 4e72e654da96..43510da5e734 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -255,7 +255,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); } - dst_dentry = user_path_create(arg.dirfd, + dst_dentry = start_creating_user_path(arg.dirfd, (const char __user *)(unsigned long)arg.dst_ptr, &dst_path, lookup_flags); error = PTR_ERR_OR_ZERO(dst_dentry); @@ -314,7 +314,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, d_instantiate(dst_dentry, &inode->v); fsnotify_mkdir(dir, dst_dentry); err3: - done_path_create(&dst_path, dst_dentry); + end_creating_path(&dst_path, dst_dentry); err2: if (arg.src_ptr) path_put(&src_path); @@ -334,7 +334,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (arg.flags) return -EINVAL; - victim = user_path_locked_at(arg.dirfd, name, &path); + victim = start_removing_user_path_at(arg.dirfd, name, &path); if (IS_ERR(victim)) return PTR_ERR(victim); @@ -351,9 +351,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, d_invalidate(victim); } err: - inode_unlock(dir); - dput(victim); - path_put(&path); + end_removing_path(&path, victim); return ret; } diff --git a/fs/init.c b/fs/init.c index eef5124885e3..07f592ccdba8 100644 --- a/fs/init.c +++ b/fs/init.c @@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) else if (!(S_ISBLK(mode) || S_ISCHR(mode))) return -EINVAL; - dentry = kern_path_create(AT_FDCWD, filename, &path, 0); + dentry = start_creating_path(AT_FDCWD, filename, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname) if (error) return error; - new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname) error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; @@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, newname, &path, 0); + dentry = start_creating_path(AT_FDCWD, newname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, pathname, &path, + LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); @@ -236,7 +237,7 @@ int 
__init init_mkdir(const char *pathname, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index 4017bc8641d3..92973a7a8091 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2758,7 +2758,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2770,15 +2771,26 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } /** @@ -2816,24 +2828,26 @@ struct dentry *kern_path_parent(const char *name, struct path *path) return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); putname(filename); return res; } -struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) { struct filename *filename = getname(name); - struct dentry *res = __kern_path_locked(dfd, filename, path); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } -EXPORT_SYMBOL(user_path_locked_at); +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -4223,8 +4237,8 @@ out: return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4232,9 +4246,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4242,10 +4256,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry 
*user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4253,7 +4268,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4361,7 +4376,7 @@ retry: break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4465,7 +4480,7 @@ retry: if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4819,7 +4834,7 @@ retry: if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4988,7 +5003,7 @@ retry: error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e..267b50e8e42e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 07739055ac9f..1cfa688904b2 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -1325,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, 
path, flags); kfree(abs_name); return dent; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 1d5038c21c20..a7800ef04e76 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -59,11 +59,15 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..fadf3817a9c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ff..768098dec231 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1427,7 @@ out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? 
-EADDRINUSE : err; -- cgit v1.2.3 From ec7d8e68ef0ec5c635c8f9e93cd881673445a397 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2025 09:19:23 -0400 Subject: sunrpc: add a Kconfig option to redirect dfprintk() output to trace buffer We have a lot of old dprintk() call sites that aren't going anywhere anytime soon. At the same time, turning them up is a serious burden on the host due to the console locking overhead. Add a new Kconfig option that redirects dfprintk() output to the trace buffer. This is more efficient than logging to the console and allows for proper interleaving of dprintk and static tracepoint events. Since using trace_printk() causes scary warnings to pop at boot time, this new option defaults to "n". Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Reviewed-by: Simon Horman Signed-off-by: Anna Schumaker --- include/linux/sunrpc/debug.h | 10 ++++++++-- net/sunrpc/Kconfig | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 99a6fa4a1d6a..891f6173c951 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -30,17 +30,23 @@ extern unsigned int nlm_debug; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) +# if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE) +# define __sunrpc_printk(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +# else +# define __sunrpc_printk(fmt, ...) printk(KERN_DEFAULT fmt, ##__VA_ARGS__) +# endif + # define dfprintk(fac, fmt, ...) \ do { \ ifdebug(fac) \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ do { \ ifdebug(fac) { \ rcu_read_lock(); \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ } \ } while (0) diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -101,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dfprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about trace_printk() being used to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. + config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS -- cgit v1.2.3 From 62c0c0e7491211969d8d1c2a9ab0e112b34664cf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call sites into svc_xprt_destroy_all(). 
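For illustration, the duplicated teardown pattern collapses as sketched below (a sketch built from the two functions named above; the new boolean argument tells svc_xprt_destroy_all() whether to perform the rpcbind cleanup as well):

	/* Before: each shutdown path paired the two calls by hand. */
	svc_xprt_destroy_all(serv, net);
	svc_rpcb_cleanup(serv, net);

	/* After: one call; callers that must not touch rpcbind, such as
	 * the NFSv4 callback service, pass false instead. */
	svc_xprt_destroy_all(serv, net, true);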
Signed-off-by: Chuck Lever Tested-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. 
*/ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..fde60d4e2cd5 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -165,7 +165,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..9c7245d811eb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..049ab53088e9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1102,6 +1102,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1114,7 +1115,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1124,6 +1126,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); -- cgit v1.2.3 From 040058a8f7fd333d4159a09ae308145a393c8551 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Sun, 10 Aug 2025 15:29:42 +0800 Subject: SUNRPC: Remove redundant __GFP_NOWARN GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant __GFP_NOWARN. Signed-off-by: Qianfeng Rong Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/socklib.c | 2 +- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 4e92e2a50168..d8d8842c7de5 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -86,7 +86,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc) /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. 
*/ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppage = alloc_page(GFP_NOWAIT); if (unlikely(*ppage == NULL)) { if (copied == 0) return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1478c41c7e9d..3aac1456e23e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); while (len > 0) { if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppages = alloc_page(GFP_NOWAIT); if (!*ppages) return -ENOBUFS; ppages++; -- cgit v1.2.3 From d57e43b72bf2071caac90da323849c3983a695f0 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 14:53:09 -0400 Subject: SUNRPC: Update svcxdr_init_decode() to call xdr_set_scratch_folio() The only snag here is that __folio_alloc_node() doesn't handle NUMA_NO_NODE, so I also need to update svc_pool_map_get_node() to return numa_mem_id() instead. I arrived at this approach by looking at what other users of __folio_alloc_node() do for this case. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 4 ++-- net/sunrpc/svc.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 40cbe81360ed..5506d20857c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -196,7 +196,7 @@ struct svc_rqst { struct xdr_buf rq_arg; struct xdr_stream rq_arg_stream; struct xdr_stream rq_res_stream; - struct page *rq_scratch_page; + struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* num of entries in rq_pages */ struct page * *rq_pages; @@ -503,7 +503,7 @@ static inline void svcxdr_init_decode(struct svc_rqst *rqstp) buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len; xdr_init_decode(xdr, buf, argv->iov_base, NULL); - xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); + xdr_set_scratch_folio(xdr, rqstp->rq_scratch_folio); } /** diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9c7245d811eb..de05ef637bdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -669,8 +669,8 @@ svc_rqst_free(struct svc_rqst *rqstp) folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -691,8 +691,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); -- cgit v1.2.3 From cc6ac66f1c0946299b8f192026cff9a320aaad18 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Jul 2025 10:46:50 -0400 Subject: SUNRPC: Update gssx_accept_sec_context() to use xdr_set_scratch_folio() This was the last caller of 
xdr_set_scratch_page(), so I remove this function while I'm at it. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 13 ------------- net/sunrpc/auth_gss/gss_rpc_xdr.c | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 3ce17321689a..49278749ad0c 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -287,19 +287,6 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) xdr->scratch.iov_len = buflen; } -/** - * xdr_set_scratch_page - Attach a scratch buffer for decoding data - * @xdr: pointer to xdr_stream struct - * @page: an anonymous page - * - * See xdr_set_scratch_buffer(). - */ -static inline void -xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) -{ - xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); -} - /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index cb32ab9a8395..7d2cdc2bd374 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -794,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -844,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } -- cgit v1.2.3 From 8f12d1137c2382c80aada8e05d7cc650cd4e403c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:49 -0700 Subject: bpf: Clear pfmemalloc flag when freeing all fragments It is possible for bpf_xdp_adjust_tail() to free all fragments. The kfunc currently clears the XDP_FLAGS_HAS_FRAGS bit, but not XDP_FLAGS_FRAGS_PF_MEMALLOC. So far, this has not caused an issue when building an sk_buff from an xdp_buff since all readers of xdp_buff->flags use the flag only when there are fragments. Clear the XDP_FLAGS_FRAGS_PF_MEMALLOC bit as well to make the flags correct.
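The stale bit has been harmless so far because readers consult the pfmemalloc bit only behind a frags check, roughly as in the sketch below (an illustration of the reader pattern using the existing xdp_buff flag helpers, not a quote of the sk_buff build code):

	/* The pfmemalloc bit is only looked at while the buffer still
	 * claims to have fragments, so a stale bit stayed inert - but
	 * only as long as every reader keeps this exact ordering. */
	if (xdp_buff_has_frags(xdp))
		skb->pfmemalloc |= xdp_buff_is_frag_pfmemalloc(xdp);

Clearing both bits together removes the dependency on that ordering.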
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-2-ameryhung@gmail.com --- include/net/xdp.h | 5 +++++ net/core/filter.c | 1 + 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb11..f288c348a6c1 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -115,6 +115,11 @@ static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { diff --git a/net/core/filter.c b/net/core/filter.c index 63f3baee2daf..5837534f4352 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4210,6 +4210,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } -- cgit v1.2.3 From dea1526fbafb55099a788cde0b659530ee5b1c66 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:50 -0700 Subject: bpf: Allow bpf_xdp_shrink_data to shrink a frag from head and tail Move skb_frag_t adjustment into bpf_xdp_shrink_data() and extend its functionality to be able to shrink an xdp fragment from both head and tail. In a later patch, bpf_xdp_pull_data() will reuse it to shrink an xdp fragment from head. Additionally, in bpf_xdp_frags_shrink_tail(), breaking the loop when bpf_xdp_shrink_data() returns false (i.e., not releasing the current fragment) is not necessary as the loop condition, offset > 0, has the same effect. Remove the else branch to simplify the code. 
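The per-fragment adjustment the helper now centralizes is sketched below (skb_frag_off_add() and skb_frag_size_sub() are the existing skbuff accessors; "tail" selects which end to trim):

	/* Trimming from the tail only shrinks the fragment; trimming
	 * from the head must also advance its offset into the page.
	 * When shrink equals the fragment size, the fragment is
	 * released instead. */
	if (!tail)
		skb_frag_off_add(frag, shrink);
	skb_frag_size_sub(frag, shrink);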
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250922233356.3356453-3-ameryhung@gmail.com --- include/net/xdp_sock_drv.h | 21 ++++++++++++++++++--- net/core/filter.c | 41 ++++++++++++++++++++++++----------------- 2 files changed, 42 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 513c8e9704f6..4f2d3268a676 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -160,13 +160,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return ret; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { - struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); list_del(&xskb->list_node); } +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; +} + static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); @@ -389,8 +399,13 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return NULL; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) +{ +} + +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) { + return NULL; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) diff --git a/net/core/filter.c b/net/core/filter.c index 5837534f4352..8cae575ad437 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } -static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - enum xdp_mem_type mem_type, bool release) +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) { - struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + struct xdp_buff *zc_frag = tail ? 
xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); if (release) { - xsk_buff_del_tail(zc_frag); - __xdp_return(0, mem_type, false, zc_frag); + xsk_buff_del_frag(zc_frag); } else { - zc_frag->data_end -= shrink; + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; } + + return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, - int shrink) + int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); - goto out; + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } - if (release) - __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } -out: return release; } @@ -4198,12 +4209,8 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; -- cgit v1.2.3 From 4dce1a0d7cf39575a5880414ea882890edd8d26f Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:51 -0700 Subject: bpf: Support pulling non-linear xdp data Add kfunc, bpf_xdp_pull_data(), to support pulling data from xdp fragments. Similar to bpf_skb_pull_data(), bpf_xdp_pull_data() makes the first len bytes of data directly readable and writable in bpf programs. If the "len" argument is larger than the linear data size, data in fragments will be copied to the linear data area when there is enough room. Specifically, the kfunc will try to use the tailroom first. When the tailroom is not enough, metadata and data will be shifted down to make room for pulling data. A use case of the kfunc is to decapsulate headers residing in xdp fragments. It is possible for a NIC driver to place headers in xdp fragments. To keep using direct packet access for parsing and decapsulating headers, users can pull headers into the linear data area by calling bpf_xdp_pull_data() and then pop the header with bpf_xdp_adjust_head(). Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250922233356.3356453-4-ameryhung@gmail.com --- net/core/filter.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 8cae575ad437..6c8a075a3016 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12214,6 +12214,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, return 0; } +/** + * bpf_xdp_pull_data() - Pull in non-linear xdp data. + * @x: &xdp_md associated with the XDP buffer + * @len: length of data to be made directly accessible in the linear part + * + * Pull in data in case the XDP buffer associated with @x is non-linear and + * not all @len are in the linear data area. + * + * Direct packet access allows reading and writing linear XDP data through + * packet pointers (i.e., &xdp_md->data + offsets). 
The amount of data which + * ends up in the linear part of the xdp_buff depends on the NIC and its + * configuration. When a frag-capable XDP program wants to directly access + * headers that may be in the non-linear area, call this kfunc to make sure + * the data is available in the linear area. Alternatively, use dynptr or + * bpf_xdp_{load,store}_bytes() to access data without pulling. + * + * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate + * headers in the non-linear data area. + * + * A call to this kfunc may reduce headroom. If there is not enough tailroom + * in the linear data area, metadata and data will be shifted down. + * + * A call to this kfunc is susceptible to change the buffer geometry. + * Therefore, at load time, all checks on pointers previously done by the + * verifier are invalidated and must be performed again, if the kfunc is used + * in combination with direct packet access. + * + * Return: + * * %0 - success + * * %-EINVAL - invalid len + */ +__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)x; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, delta, shift, headroom, tailroom, n_frags_free = 0; + void *data_hard_end = xdp_data_hard_end(xdp); + int data_len = xdp->data_end - xdp->data; + void *start; + + if (len <= data_len) + return 0; + + if (unlikely(len > xdp_get_buff_len(xdp))) + return -EINVAL; + + start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta; + + headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); + tailroom = data_hard_end - xdp->data_end; + + delta = len - data_len; + if (unlikely(delta > tailroom + headroom)) + return -EINVAL; + + shift = delta - tailroom; + if (shift > 0) { + memmove(start - shift, start, xdp->data_end - start); + + xdp->data_meta -= shift; + xdp->data -= shift; + xdp->data_end -= shift; + } + + for (i = 0; i < sinfo->nr_frags && delta; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + u32 shrink = min_t(u32, delta, skb_frag_size(frag)); + + memcpy(xdp->data_end, skb_frag_address(frag), shrink); + + xdp->data_end += shrink; + sinfo->xdp_frags_size -= shrink; + delta -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) + n_frags_free++; + } + + if (unlikely(n_frags_free)) { + memmove(sinfo->frags, sinfo->frags + n_frags_free, + (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); + + sinfo->nr_frags -= n_frags_free; + + if (!sinfo->nr_frags) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + } + } + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12241,6 +12333,7 @@ BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_ID_FLAGS(func, bpf_xdp_pull_data) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) -- cgit v1.2.3 From 7eb83bff02ad5e82e8c456c58717ef181c220870 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:53 -0700 Subject: bpf: Make variables in bpf_prog_test_run_xdp less confusing Change the variable naming in bpf_prog_test_run_xdp() to make the overall logic less confusing. As different modes were added to the function over time, some variables got overloaded, making the code hard to understand and error-prone to change. Replace "size" with "linear_sz" where it refers to the size of metadata and data.
If "size" refers to input data size, use test.data_size_in directly. Replace "max_data_sz" with "max_linear_sz" to better reflect the fact that it is the maximum size of metadata and data (i.e., linear_sz). Also, xdp_rxq.frag_size is always PAGE_SIZE, so just set it directly instead of subtracting headroom and tailroom and adding them back. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250922233356.3356453-6-ameryhung@gmail.com --- net/bpf/test_run.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9728dbd4c66c..82af47d8c123 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1207,9 +1207,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 retval = 0, duration, max_linear_sz, size; + u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; - u32 retval = 0, duration, max_data_sz; - u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; @@ -1246,7 +1246,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ctx) { /* There can't be user provided data before the meta data */ - if (ctx->data_meta || ctx->data_end != size || + if (ctx->data_meta || ctx->data_end != kattr->test.data_size_in || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) @@ -1255,30 +1255,30 @@ headroom -= ctx->data; } - max_data_sz = PAGE_SIZE - headroom - tailroom; - if (size > max_data_sz) { - /* disallow live data mode for jumbo frames */ - if (do_live) - goto free_ctx; - size = max_data_sz; - } + max_linear_sz = PAGE_SIZE - headroom - tailroom; + linear_sz = min_t(u32, linear_sz, max_linear_sz); + + /* disallow live data mode for jumbo frames */ + if (do_live && kattr->test.data_size_in > linear_sz) + goto free_ctx; - data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); + data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; + rxqueue->xdp_rxq.frag_size = PAGE_SIZE; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); - xdp_prepare_buff(&xdp, data, headroom, size, true); + xdp_prepare_buff(&xdp, data, headroom, linear_sz, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; + size = linear_sz; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); -- cgit v1.2.3 From fe9544ed1a2e9217b2c5285c3a4ac0dc5a38bd7b Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Mon, 22 Sep 2025 16:33:54 -0700 Subject: bpf: Support specifying linear xdp packet data size for BPF_PROG_TEST_RUN To test bpf_xdp_pull_data(), an xdp packet containing fragments as well as a free linear data area after xdp->data_end needs to be created.
However, bpf_prog_test_run_xdp() always fills the linear area with data_in before creating fragments, leaving no space to pull data. This patch will allow users to specify the linear data size through ctx->data_end. Currently, ctx_in->data_end must match data_size_in and will not be the final ctx->data_end seen by xdp programs. This is because ctx->data_end is populated according to the xdp_buff passed to test_run. The linear data area available in an xdp_buff, max_linear_sz, is always filled up before copying data_in into fragments. This patch will allow users to specify the size of data that goes into the linear area. When ctx_in->data_end is different from data_size_in, only ctx_in->data_end bytes of data will be put into the linear area when creating the xdp_buff. While ctx_in->data_end will be allowed to be different from data_size_in, it cannot be larger than data_size_in as there will be no data to copy from user space. If it is larger than the maximum linear data area size, the layout suggested by the user will not be honored. Data beyond max_linear_sz bytes will still be copied into fragments. Finally, since it is possible for a NIC to produce an xdp_buff with an empty linear data area, allow it when calling bpf_test_init() from bpf_prog_test_run_xdp() so that we can test XDP kfuncs with such an xdp_buff. This is done by moving the lower-bound check to the callers, as most of them already perform it, except bpf_prog_test_run_skb(). The change also fixes a bug that allowed passing an xdp_buff with data < ETH_HLEN. This can happen when ctx is used and metadata is at least ETH_HLEN. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250922233356.3356453-7-ameryhung@gmail.com --- net/bpf/test_run.c | 15 ++++++++++++--- .../selftests/bpf/prog_tests/xdp_context_test_run.c | 4 +--- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 82af47d8c123..3df3fe46beb3 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -665,7 +665,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) + if (user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); size = SKB_DATA_ALIGN(size); @@ -1001,6 +1001,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, kattr->test.cpu || kattr->test.batch_size) return -EINVAL; + if (size < ETH_HLEN) + return -EINVAL; + data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); @@ -1207,7 +1210,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - u32 retval = 0, duration, max_linear_sz, size; + u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size; u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; u32 headroom = XDP_PACKET_HEADROOM; @@ -1246,13 +1249,16 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ctx) { /* There can't be user provided data before the meta data */ - if (ctx->data_meta || ctx->data_end != kattr->test.data_size_in || + if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in || ctx->data > ctx->data_end ||
unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; + + meta_sz = ctx->data; + linear_sz = ctx->data_end; } max_linear_sz = PAGE_SIZE - headroom - tailroom; @@ -1262,6 +1268,9 @@ if (do_live && kattr->test.data_size_in > linear_sz) goto free_ctx; + if (kattr->test.data_size_in - meta_sz < ETH_HLEN) + return -EINVAL; + data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 46e0730174ed..178292d1251a 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -97,9 +97,7 @@ void test_xdp_context_test_run(void) /* Meta data must be 255 bytes or smaller */ test_xdp_context_error(prog_fd, opts, 0, 256, sizeof(data), 0, 0, 0); - /* Total size of data must match data_end - data_meta */ - test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), - sizeof(data) - 1, 0, 0, 0); + /* Total size of data must be data_end - data_meta or larger */ test_xdp_context_error(prog_fd, opts, 0, sizeof(__u32), sizeof(data) + 1, 0, 0, 0); -- cgit v1.2.3 From b650bf0977d34c52befb31a9fa711534e11b220f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2025 10:42:40 +0000 Subject: udp: remove busylock and add per NUMA queues busylock was protecting UDP sockets against packet floods, but unfortunately was not protecting the host itself. Under stress, many cpus could spin while acquiring the busylock, and NIC had to drop packets. Or packets would be dropped in cpu backlog if RPS/RFS were in place. This patch replaces the busylock by intermediate lockless queues (one queue per NUMA node). This means that fewer cpus have to acquire the UDP receive queue lock. Most of the cpus can either: - immediately drop the packet. - or queue it in their NUMA aware lockless queue. Then one of the cpus is chosen to process this lockless queue in a batch. The batch only contains packets that were cooked on the same NUMA node, thus with very limited latency impact. Tested: DDOS targeting a victim UDP socket, on a platform with 6 NUMA nodes (Intel(R) Xeon(R) 6985P-C) Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1004179 0.0 Udp6InErrors 3117 0.0 Udp6RcvbufErrors 3117 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1116633 0.0 Udp6InErrors 14197275 0.0 Udp6RcvbufErrors 14197275 0.0 We can see this host can now process 14.2 M more packets per second while under attack, and the victim socket can receive 11% more packets. I used a small bpftrace program measuring time (in us) spent in __udp_enqueue_schedule_skb().
Before: @udp_enqueue_us[398]: [0] 24901 |@@@ | [1] 63512 |@@@@@@@@@ | [2, 4) 344827 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [4, 8) 244673 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [8, 16) 54022 |@@@@@@@@ | [16, 32) 222134 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [32, 64) 232042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [64, 128) 4219 | | [128, 256) 188 | | After: @udp_enqueue_us[398]: [0] 5608855 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1] 1111277 |@@@@@@@@@@ | [2, 4) 501439 |@@@@ | [4, 8) 102921 | | [8, 16) 29895 | | [16, 32) 43500 | | [32, 64) 31552 | | [64, 128) 979 | | [128, 256) 13 | | Note that the remaining bottleneck for this platform is in udp_drops_inc() because we limited struct numa_drop_counters to only two nodes so far. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250922104240.2182559-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 9 +++- include/net/udp.h | 11 ++++- net/ipv4/udp.c | 117 +++++++++++++++++++++++++++++++--------------------- net/ipv6/udp.c | 5 ++- 4 files changed, 91 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/linux/udp.h b/include/linux/udp.h index e554890c4415..58795688a186 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -44,6 +44,12 @@ enum { UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; +/* per NUMA structure for lockless producer usage. */ +struct udp_prod_queue { + struct llist_head ll_root ____cacheline_aligned_in_smp; + atomic_t rmem_alloc; +}; + struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; @@ -90,6 +96,8 @@ struct udp_sock { struct sk_buff *skb, int nhoff); + struct udp_prod_queue *udp_prod_queue; + /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; @@ -109,7 +117,6 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; - spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 059a0cee5f55..cffedb3e40f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -284,16 +284,23 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); -static inline void udp_lib_init_sock(struct sock *sk) +static inline int udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; - spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + + up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue), + GFP_KERNEL); + if (!up->udp_prod_queue) + return -ENOMEM; + for (int i = 0; i < nr_node_ids; i++) + init_llist_head(&up->udp_prod_queue[i].ll_root); + return 0; } static inline void udp_drops_inc(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 85cfc32eb2cc..95241093b7f0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1685,25 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } -/* Idea of busylocks is to let producers grab an extra spinlock - * to relieve pressure on the receive_queue spinlock shared by 
consumer. - * Under flood, this means that only one producer can be in line - * trying to acquire the receive_queue spinlock. - */ -static spinlock_t *busylock_acquire(struct sock *sk) -{ - spinlock_t *busy = &udp_sk(sk)->busylock; - - spin_lock(busy); - return busy; -} - -static void busylock_release(spinlock_t *busy) -{ - if (busy) - spin_unlock(busy); -} - static int udp_rmem_schedule(struct sock *sk, int size) { int delta; @@ -1718,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; + struct udp_prod_queue *udp_prod_queue; + struct sk_buff *next, *to_drop = NULL; + struct llist_node *ll_list; unsigned int rmem, rcvbuf; - spinlock_t *busy = NULL; int size, err = -ENOMEM; + int total_size = 0; + int q_size = 0; + int dropcount; + int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; + + rmem += atomic_read(&udp_prod_queue->rmem_alloc); + /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ @@ -1747,45 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > rcvbuf) - goto uncharge_drop; - busy = busylock_acquire(sk); - } else { - atomic_add(size, &sk->sk_rmem_alloc); } udp_set_dev_scratch(skb); + atomic_add(size, &udp_prod_queue->rmem_alloc); + + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) + return 0; + + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; + spin_lock(&list->lock); - err = udp_rmem_schedule(sk, size); - if (err) { - spin_unlock(&list->lock); - goto uncharge_drop; - } - sk_forward_alloc_add(sk, -size); + ll_list = llist_del_all(&udp_prod_queue->ll_root); - /* no need to setup a destructor, we will explicitly release the - * forward allocated memory on dequeue - */ - sock_skb_set_dropcount(sk, skb); + ll_list = llist_reverse_order(ll_list); + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + size = udp_skb_truesize(skb); + total_size += size; + err = udp_rmem_schedule(sk, size); + if (unlikely(err)) { + /* Free the skbs outside of locked section. */ + skb->next = to_drop; + to_drop = skb; + continue; + } + + q_size += size; + sk_forward_alloc_add(sk, -size); + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + SOCK_SKB_CB(skb)->dropcount = dropcount; + nb++; + __skb_queue_tail(list, skb); + } + + atomic_add(q_size, &sk->sk_rmem_alloc); - __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + if (!sock_flag(sk, SOCK_DEAD)) { + /* Multiple threads might be blocked in recvmsg(), + * using prepare_to_wait_exclusive(). + */ + while (nb) { + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + nb--; + } + } + + if (unlikely(to_drop)) { + for (nb = 0; to_drop != NULL; nb++) { + skb = to_drop; + to_drop = skb->next; + skb_mark_not_on_list(skb); + /* TODO: update SNMP values. 
*/ + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); + } + numa_drop_add(&udp_sk(sk)->drop_counters, nb); + } - busylock_release(busy); - return 0; + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); -uncharge_drop: - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + return 0; drop: udp_drops_inc(sk); - busylock_release(busy); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); @@ -1803,6 +1826,7 @@ void udp_destruct_common(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); + kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); @@ -1814,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9f4d340d1e3a..813a2ba75824 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3 From ca9f9cdc4de97d0221100b11224738416696163c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 22 Sep 2025 15:19:57 -0400 Subject: net: allow alloc_skb_with_frags() to use MAX_SKB_FRAGS Currently, alloc_skb_with_frags() will only fill (MAX_SKB_FRAGS - 1) slots. I think it should use all MAX_SKB_FRAGS slots, as callers of alloc_skb_with_frags() will size their allocation of frags based on MAX_SKB_FRAGS. This issue was discovered via a test patch that sets 'order' to 0 in alloc_skb_with_frags(), which effectively tests/simulates high fragmentation. In this case sendmsg() on unix sockets will fail every time for large allocations. If the PAGE_SIZE is 4K, then data_len will request 68K or 17 pages, but alloc_skb_with_frags() can only allocate 64K in this case or 16 pages. Fixes: 09c2c90705bb ("net: allow alloc_skb_with_frags() to allocate bigger packets") Signed-off-by: Jason Baron Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250922191957.2855612-1-jbaron@akamai.com Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ee0274417948..1c0279b9cb9f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6667,7 +6667,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; while (data_len) { - if (nr_frags == MAX_SKB_FRAGS - 1) + if (nr_frags == MAX_SKB_FRAGS) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; -- cgit v1.2.3 From 390b3a300d7872cef9588f003b204398be69ce08 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 21 Sep 2025 18:08:22 +0300 Subject: nexthop: Forbid FDB status change while nexthop is in a group The kernel forbids the creation of non-FDB nexthop groups with FDB nexthops: # ip nexthop add id 1 via 192.0.2.1 fdb # ip nexthop add id 2 group 1 Error: Non FDB nexthop group cannot have fdb nexthops. And vice versa: # ip nexthop add id 3 via 192.0.2.2 dev dummy1 # ip nexthop add id 4 group 3 fdb Error: FDB nexthop group can only have fdb nexthops. 
However, as long as no routes are pointing to a non-FDB nexthop group, the kernel allows changing the type of a nexthop from FDB to non-FDB and vice versa: # ip nexthop add id 5 via 192.0.2.2 dev dummy1 # ip nexthop add id 6 group 5 # ip nexthop replace id 5 via 192.0.2.2 fdb # echo $? 0 This configuration is invalid and can result in an NPD [1] since FDB nexthops are not associated with a nexthop device: # ip route add 198.51.100.1/32 nhid 6 # ping 198.51.100.1 Fix by preventing nexthop FDB status change while the nexthop is in a group: # ip nexthop add id 7 via 192.0.2.2 dev dummy1 # ip nexthop add id 8 group 7 # ip nexthop replace id 7 via 192.0.2.2 fdb Error: Cannot change nexthop FDB status while in a group. [1] BUG: kernel NULL pointer dereference, address: 00000000000003c0 [...] Oops: Oops: 0000 [#1] SMP CPU: 6 UID: 0 PID: 367 Comm: ping Not tainted 6.17.0-rc6-virtme-gb65678cacc03 #1 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-4.fc41 04/01/2014 RIP: 0010:fib_lookup_good_nhc+0x1e/0x80 [...] Call Trace: fib_table_lookup+0x541/0x650 ip_route_output_key_hash_rcu+0x2ea/0x970 ip_route_output_key_hash+0x55/0x80 __ip4_datagram_connect+0x250/0x330 udp_connect+0x2b/0x60 __sys_connect+0x9c/0xd0 __x64_sys_connect+0x18/0x20 do_syscall_64+0xa4/0x2a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 38428d68719c ("nexthop: support for fdb ecmp nexthops") Reported-by: syzbot+6596516dd2b635ba2350@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68c9a4d2.050a0220.3c6139.0e63.GAE@google.com/ Tested-by: syzbot+6596516dd2b635ba2350@syzkaller.appspotmail.com Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/20250921150824.149157-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 29118c43ebf5..34137768e7f9 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2399,6 +2399,13 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, return -EINVAL; } + if (!list_empty(&old->grp_list) && + rtnl_dereference(new->nh_info)->fdb_nh != + rtnl_dereference(old->nh_info)->fdb_nh) { + NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); + return -EINVAL; + } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; -- cgit v1.2.3 From cd9a9562b2559973aa1b68c3af63021a2c5fd022 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Mon, 22 Sep 2025 16:14:48 +0200 Subject: net: bridge: Install FDB for bridge MAC on VLAN 0 Currently, after the bridge is created, the FDB does not hold an FDB entry for the bridge MAC on VLAN 0: # ip link add name br up type bridge # ip -br link show dev br br UNKNOWN 92:19:8c:4e:01:ed # bridge fdb show | grep 92:19:8c:4e:01:ed 92:19:8c:4e:01:ed dev br vlan 1 master br permanent Later when the bridge MAC is changed, or in fact when the address is given during netdevice creation, the entry appears: # ip link add name br up address 00:11:22:33:44:55 type bridge # bridge fdb show | grep 00:11:22:33:44:55 00:11:22:33:44:55 dev br vlan 1 master br permanent 00:11:22:33:44:55 dev br master br permanent However, when the bridge address is set by the user to the current bridge address before the first port is enslaved, none of the address handlers gets invoked, because the address is not actually changed. The address is however marked as NET_ADDR_SET.
Then when a port is enslaved, the address is not changed, because it is NET_ADDR_SET. Thus the VLAN 0 entry is not added, and it has not been added previously either: # ip link add name br up type bridge # ip -br link show dev br br UNKNOWN 7e:f0:a8:1a:be:c2 # ip link set dev br addr 7e:f0:a8:1a:be:c2 # ip link add name v up type veth # ip link set dev v master br # ip -br link show dev br br UNKNOWN 7e:f0:a8:1a:be:c2 # bridge fdb | grep 7e:f0:a8:1a:be:c2 7e:f0:a8:1a:be:c2 dev br vlan 1 master br permanent Then when the bridge MAC is used as DMAC, and br_handle_frame_finish() looks up an FDB entry with VLAN=0, it doesn't find any, and floods the traffic instead of passing it up. Fix this by simply always adding the VLAN 0 FDB entry for the bridge itself on netdevice creation. This also makes the behavior consistent with how ports are treated: ports always have an FDB entry for each member VLAN as well as VLAN 0. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/415202b2d1b9b0899479a502bbe2ba188678f192.1758550408.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 512872a2ef81..c37e52e2f29a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -37,6 +37,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v int err; if (netif_is_bridge_master(dev)) { + struct net_bridge *br = netdev_priv(dev); + + if (event == NETDEV_REGISTER) + br_fdb_change_mac_address(br, dev->dev_addr); + err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); -- cgit v1.2.3 From e8ab231782e92bc26e5eb605263525636a2f7ae7 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 22 Sep 2025 16:19:24 -0700 Subject: net: ethtool: tsconfig: set command must provide a reply Timestamping configuration through ethtool has inconsistent behavior of skipping the reply for the set command if the configuration was not changed. Fix it by providing a reply in any case. Fixes: 6e9e2eed4f39d ("net: ethtool: Add support for tsconfig command to get/set hwtstamp config") Signed-off-by: Vadim Fedorenko Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250922231924.2769571-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- net/ethtool/tsconfig.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 2be356bdfe87..169b413b31fc 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -423,13 +423,11 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, return ret; } - if (hwprov_mod || config_mod) { - ret = tsconfig_send_reply(dev, info); - if (ret && ret != -EOPNOTSUPP) { - NL_SET_ERR_MSG(info->extack, - "error while reading the new configuration set"); - return ret; - } + ret = tsconfig_send_reply(dev, info); + if (ret && ret != -EOPNOTSUPP) { + NL_SET_ERR_MSG(info->extack, + "error while reading the new configuration set"); + return ret; } /* tsconfig has no notification */ -- cgit v1.2.3 From dc1dea796b197aba2c3cae25bfef45f4b3ad46fe Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Sep 2025 00:54:19 +0000 Subject: tcp: Remove stale locking comment for TFO. The listener -> child locking no longer exists in the fast path since commit e994b2f0fb92 ("tcp: do not lock listener to process SYN packets").
Let's remove the stale comment for reqsk_fastopen_remove(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250923005441.4131554-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/core/request_sock.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 63de5c635842..897a8f01a67b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -77,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. A corner - * case might also exist in tcp_v4_hnd_req() that will trigger this locking - * order. + * acquire a child's lock while holding listener's socket lock. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the -- cgit v1.2.3 From 134121bfd99a06d44ef5ba15a9beb075297c0821 Mon Sep 17 00:00:00 2001 From: Slavin Liu Date: Fri, 12 Sep 2025 01:57:59 +0800 Subject: ipvs: Defer ip_vs_ftp unregister during netns cleanup On the netns cleanup path, __ip_vs_ftp_exit() may unregister ip_vs_ftp before connections with valid cp->app pointers are flushed, leading to a use-after-free. Fix this by introducing a global `exiting_module` flag, set to true in ip_vs_ftp_exit() before unregistering the pernet subsystem. In __ip_vs_ftp_exit(), skip the ip_vs_ftp unregister if called during netns cleanup (when exiting_module is false) and defer it to __ip_vs_cleanup_batch(), which unregisters all apps after all connections are flushed. If called during module exit, unregister ip_vs_ftp immediately. Fixes: 61b1ab4583e2 ("IPVS: netns, add basic init per netns.") Suggested-by: Julian Anastasov Signed-off-by: Slavin Liu Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_ftp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index d8a284999544..206c6700e200 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -53,6 +53,7 @@ enum { IP_VS_FTP_EPSV, }; +static bool exiting_module; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. @@ -605,7 +606,7 @@ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!ipvs) + if (!ipvs || !exiting_module) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); @@ -627,6 +628,7 @@ static int __init ip_vs_ftp_init(void) */ static void __exit ip_vs_ftp_exit(void) { + exiting_module = true; unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } -- cgit v1.2.3 From 09efbac953f6f076a07735f9ba885148d4796235 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 19 Sep 2025 14:40:43 +0200 Subject: netfilter: nfnetlink: reset nlh pointer during batch replay During a batch replay, the nlh pointer is not reset until the parsing of the commands. Since commit bf2ac490d28c ("netfilter: nfnetlink: Handle ACK flags for batch messages"), this has been problematic, as the condition to add an ACK for batch begin will evaluate to true even if NLM_F_ACK wasn't used for the batch begin message.
If there is an error during command processing, netlink sends an ACK nevertheless. This misleads userspace tools which think that the return code was 0. Reset the nlh pointer to the original one when a replay is triggered. Fixes: bf2ac490d28c ("netfilter: nfnetlink: Handle ACK flags for batch messages") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..811d02b4c4f7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -376,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; + struct nlmsghdr *onlh = nlh; LIST_HEAD(err_list); u32 status; int err; @@ -386,6 +387,7 @@ replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); + nlh = onlh; if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); -- cgit v1.2.3 From 4dbac7db17f1e973fbbd6aea07a00181cd4cd162 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Sep 2025 18:34:01 +0200 Subject: netfilter: nft_set_pipapo: use 0 genmask for packetpath lookups In commit c4eaca2e1052 ("netfilter: nft_set_pipapo: don't check genbit from packetpath lookups") I replaced genmask_cur() with NFT_GENMASK_ANY, but this change has no effect in the pipapo set type. New entries are unreachable from the active copy, so NFT_GENMASK_ANY has the same result as genmask_cur(): current-gen elements are disabled and the new-generation elements cannot be found. Tests did not catch this incomplete fix because the change also dropped the genmask test from the AVX2 version of the algorithm, so the test only fails if the host cpu lacks AVX2 support. Use the genmask test only from the control plane (inserts, deletions, ..). The packet path has to skip the check; use of 0 is enough for this because ext->genmask has the relevant bit set when the element is INACTIVE in that generation: using a 0 genmask thus makes nft_set_elem_active() always return true. Fix the comment and replace NFT_GENMASK_ANY with 0. Fixes: c4eaca2e1052 ("netfilter: nft_set_pipapo: don't check genbit from packetpath lookups") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 9 ++++----- net/netfilter/nft_set_pipapo_avx2.c | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index a7b8fa8cab7c..112fe46788b6 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -549,8 +549,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. - * Unlike other set types, this uses NFT_GENMASK_ANY instead of - * nft_genmask_cur(). + * Unlike other set types, this uses 0 instead of nft_genmask_cur(). * * This is because new (future) elements are not reachable from * priv->match, they get added to priv->clone instead. * @@ -560,8 +559,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, * inconsistent state: matching old entries get skipped but thew * newly matching entries are unreachable. * - * GENMASK will still find the 'now old' entries which ensures consistent - * priv->match view.
+ * GENMASK_ANY doesn't work for the same reason: old-gen entries get + * skipped, new-gen entries are only reachable from priv->clone. * * nft_pipapo_commit swaps ->clone and ->match shortly after the * genbit flip. As ->clone doesn't contain the old entries in the first @@ -578,7 +577,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get_slow(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); + e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 27dab3667548..e72fd045d037 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1292,7 +1292,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, m = rcu_dereference(priv->match); - e = pipapo_get_avx2(m, rp, NFT_GENMASK_ANY, get_jiffies_64()); + e = pipapo_get_avx2(m, rp, 0, get_jiffies_64()); local_bh_enable(); return e ? &e->ext : NULL; -- cgit v1.2.3 From 5823699a11cf3c05d751b473c66920d2d3cac0a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 14:11:48 +0200 Subject: netfilter: nft_set_pipapo_avx2: fix skip of expired entries KASAN reports the following splat: BUG: KASAN: slab-out-of-bounds in pipapo_get_avx2+0x941/0x25d0 Read of size 1 at addr ffff88814c561be0 by task nft/3944 Call Trace: pipapo_get_avx2+0x941/0x25d0 nft_pipapo_insert+0x440/0x11b0 nf_tables_newsetelem+0x220a/0x3a00 .. This bisects to commit 84c1da7b38d9 ("netfilter: nft_set_pipapo: use AVX2 algorithm for insertions too"). However, that change merely uncovers this bug. When we find a match but that match has expired or timed out, the AVX2 implementation restarts the full match loop. At that point, the pointer to the key data has already been changed and points to the key's last field. This will then result in an out-of-bounds read once it is incremented again for the next field. The restart logic in AVX2 is different compared to the plain C implementation, but both should follow the same logic. The C implementation just calls pipapo_refill() again to check the next entry. Do the same in the AVX2 implementation. Note that with this change, due to implementation differences of pipapo_refill vs. nft_pipapo_avx2_refill, the refill call will return the same element again. Then, on the next call, it will move to the next entry as expected. This is because avx2_refill doesn't clear the bitmap in the 'last' conditional. This is harmless. Expired/timed out elements are also not expected to be frequent. A selftest is added in a followup commit.
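To see the failure mode outside the kernel, here is a toy C model; all names are invented for illustration and none of them are the kernel's. The point it demonstrates: the field-matching pass consumes the lookup key by advancing a cursor, so an expired hit must be skipped by fetching the next candidate at the current position, not by re-running the pass with the already-advanced cursor.

#include <stdbool.h>
#include <stdio.h>

enum { NUM_FIELDS = 4 };

struct toy_elem { bool expired; };

/* Two candidates match the key; the first one has expired. */
static const struct toy_elem table[] = {
        { .expired = true },
        { .expired = false },
};

int main(void)
{
        const unsigned char key[NUM_FIELDS] = { 1, 2, 3, 4 };
        const unsigned char *cursor = key;
        size_t candidate = 0;

        /* Field matching consumes the key, the way the AVX2 loop
         * advances its key pointer: cursor now sits one past the
         * last field. */
        for (int i = 0; i < NUM_FIELDS; i++)
                (void)*cursor++;

        /* The fix, modelled: on an expired hit, ask the refill step
         * for the next candidate and leave the cursor alone.
         * Restarting the field loop here instead would dereference
         * cursor past the end of key[]. */
        while (candidate < sizeof(table) / sizeof(table[0]) &&
               table[candidate].expired)
                candidate++;

        printf("live candidate index: %zu\n", candidate);
        return 0;
}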
Fixes: 7400b063969b ("nft_set_pipapo: Introduce AVX2-based lookup implementation") Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo_avx2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index e72fd045d037..7ff90325c97f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1179,7 +1179,6 @@ struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, nft_pipapo_avx2_prepare(); -next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; int ret = 0; @@ -1226,6 +1225,7 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP +next_match: if (ret < 0) { scratch->map_index = map_index; kernel_fpu_end(); @@ -1238,8 +1238,11 @@ next_match: e = f->mt[ret].e; if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || - !nft_set_elem_active(&e->ext, genmask))) + !nft_set_elem_active(&e->ext, genmask))) { + ret = pipapo_refill(res, f->bsize, f->rules, + fill, f->mt, last); goto next_match; + } scratch->map_index = map_index; kernel_fpu_end(); -- cgit v1.2.3 From c5ba345b2d358b07cc4f07253ba1ada73e77d586 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Sep 2025 07:27:09 +0000 Subject: netfilter: nf_conntrack: do not skip entries in /proc/net/nf_conntrack ct_seq_show() has an opportunistic garbage collector: if (nf_ct_should_gc(ct)) { nf_ct_kill(ct); goto release; } So if one nf_conn is killed there, next time ct_get_next() runs, we skip the following item in the bucket, even if it should have been displayed had gc not taken place. We can decrement st->skip_elems to tell ct_get_next() that one of the items was removed from the chain. Fixes: 58e207e4983d ("netfilter: evict stale entries when user reads /proc/net/nf_conntrack") Signed-off-by: Eric Dumazet Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_standalone.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 1f14ef0436c6..708b79380f04 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -317,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v) smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { + struct ct_iter_state *st = s->private; + + st->skip_elems--; nf_ct_kill(ct); goto release; } -- cgit v1.2.3 From b6db19d1df8a75b5f05f5fe487cbd09f48760a3c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 23 Sep 2025 22:45:10 +0200 Subject: tls: Avoid -Wflex-array-member-not-at-end warning Remove the unused flexible-array member in struct tls_rec and, with this, fix the following warning: net/tls/tls.h:131:29: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Also, add a comment to prevent people from adding any members after struct aead_request, which is a flexible structure --that is, a structure that ends in a flexible-array member. Signed-off-by: Gustavo A. R.
Silva Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/aNMG1lyXw4XEAVaE@kspp Signed-off-by: Jakub Kicinski --- net/tls/tls.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index e4c42731ce39..2f86baeb71fc 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -128,8 +128,9 @@ struct tls_rec { char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_MAX_IV_SIZE]; + + /* Must be last --ends in a flexible-array member. */ struct aead_request aead_req; - u8 aead_req_ctx[]; }; int __net_init tls_proc_init(struct net *net); -- cgit v1.2.3 From 4055526d35746ce8b04bfa5e14e14f28bb163186 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:59 +0200 Subject: ns: move ns type into struct ns_common It's misplaced in struct proc_ns_operations and ns->ops might be NULL if the namespace is compiled out but we still want to know the type of the namespace for the initial namespace struct. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 6 +++--- fs/nsfs.c | 18 +++++++++--------- include/linux/ns_common.h | 30 +++++++++++++++++++++++++----- include/linux/proc_ns.h | 1 - init/version-timestamp.c | 1 + ipc/msgutil.c | 1 + ipc/namespace.c | 1 - kernel/cgroup/cgroup.c | 1 + kernel/cgroup/namespace.c | 1 - kernel/nscommon.c | 5 +++-- kernel/nsproxy.c | 4 ++-- kernel/nstree.c | 8 ++++---- kernel/pid.c | 1 + kernel/pid_namespace.c | 2 -- kernel/time/namespace.c | 3 +-- kernel/user.c | 1 + kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - net/core/net_namespace.c | 1 - 19 files changed, 52 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/fs/namespace.c b/fs/namespace.c index d65917ec5544..01334d5038a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4927,7 +4927,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5830,7 +5830,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -6016,6 +6016,7 @@ struct mnt_namespace init_mnt_ns = { .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), @@ -6333,7 +6334,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/nsfs.c b/fs/nsfs.c index dc0a4404b971..e7fd8a790aaa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -219,9 +219,9 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; @@ -234,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != 
CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -273,7 +273,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return ret; } case NS_GET_MNTNS_ID: - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; fallthrough; case NS_GET_ID: { @@ -293,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -314,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -453,7 +453,7 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } fid->ns_id = ns->ns_id; - fid->ns_type = ns->ops->type; + fid->ns_type = ns->ns_type; fid->ns_inum = inode->i_ino; return FILEID_NSFS; } @@ -489,14 +489,14 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); if (!__ns_ref_get(ns)) return NULL; } - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: if (!current_in_namespace(to_cg_ns(ns))) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 56492cd9ff8d..f5b68b8abb54 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,6 +4,7 @@ #include #include +#include struct proc_ns_operations; @@ -37,6 +38,7 @@ extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; @@ -51,7 +53,7 @@ struct ns_common { }; }; -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ @@ -106,10 +108,28 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_init(__ns) \ - __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 08016f6e0e6f..e81b8e596e4f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 376b7c856d4d..d071835121c2 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,6 +8,7 @@ #include struct uts_namespace init_uts_ns = { + .ns.ns_type = ns_common_type(&init_uts_ns), .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index dca6c8ec8f5f..7a03f6d03de3 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -33,6 +33,7 @@ struct ipc_namespace init_ipc_ns = { #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif + .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index d89dfd718d2b..76abac74a5c3 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -248,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245b43ff2fa4..9b75102e81cb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -224,6 +224,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.ops = &cgroupns_operations, .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 04c98338ac08..241ca05f07c8 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -137,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 3cef89ddef41..92c9df1e8774 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -7,7 +7,7 @@ #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); @@ -52,12 +52,13 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) } #endif -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; + ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8d62449237b6 100644 --- a/kernel/nsproxy.c +++ 
b/kernel/nsproxy.c @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c index ecc88b013eff..b24a320a11a6 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -106,7 +106,7 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree->ns_tree_lock); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); /* @@ -128,7 +128,7 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) { VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree->ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); @@ -197,7 +197,7 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) if (!node) return NULL; - VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); return node_to_ns(node); } @@ -225,7 +225,7 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, if (list_is_head(list, &ns_tree->ns_list)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); return list_entry_rcu(list, struct ns_common, ns_list_node); } diff --git a/kernel/pid.c b/kernel/pid.c index 7e8c66e0bf67..0c2dcddb317a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a262a3f19443..f5b222c8ac39 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -443,7 +443,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -454,7 +453,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 9f26e61be044..530cf99c2212 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -462,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -472,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = 
timens_install, @@ -480,6 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { + .ns.ns_type = ns_common_type(&init_time_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = ns_init_inum(&init_time_ns), diff --git a/kernel/user.c b/kernel/user.c index b2a53674d506..0163665914c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,6 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, + .ns.ns_type = ns_common_type(&init_user_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e1559e8a8a02..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1400,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, diff --git a/kernel/utsname.c b/kernel/utsname.c index 00001592ad13..a8cdc84648ee 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -146,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bdea7d5fac56..dfe84bd35f98 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1543,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, -- cgit v1.2.3 From 25c550464acd40803d63868dfa4a42506df48b88 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:04 +0200 Subject: net: gro: remove is_ipv6 from napi_gro_cb Remove is_ipv6 from napi_gro_cb and use sk->sk_family instead. This frees up space for another ip_fixedid bit that will be added in the next commit. udp_sock_create always creates either an AF_INET or an AF_INET6 socket, so using sk->sk_family is reliable. In IPv6-FOU, cfg->ipv6_v6only is always enabled. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-2-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 3 --- net/ipv4/fou_core.c | 32 ++++++++++++++------------------ net/ipv4/udp_offload.c | 2 -- net/ipv6/udp_offload.c | 2 -- 4 files changed, 14 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/gro.h b/include/net/gro.h index a0fca7ac6e7e..87c68007f949 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -71,9 +71,6 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 3e30745e2c09..3970b6b7ace5 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -228,21 +228,27 @@ drop: return 0; } +static const struct net_offload *fou_gro_ops(const struct sock *sk, + int proto) +{ + const struct net_offload __rcu **offloads; + + /* FOU doesn't allow IPv4 on IPv6 sockets. */ + offloads = sk->sk_family == AF_INET6 ?
inet6_offloads : inet_offloads; + return rcu_dereference(offloads[proto]); +} + static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; - u8 proto; if (!fou) goto out; - proto = fou->protocol; - /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply @@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -268,10 +273,8 @@ out: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - u8 proto; int err; if (!fou) { @@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, goto out; } - proto = fou->protocol; - - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; @@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -450,8 +449,7 @@ next_proto: /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -467,7 +465,6 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); - const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; @@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return err; } - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b1f3fd302e9d..19d0b5b09ffa 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 0; - if (static_branch_unlikely(&udp_encap_needed_key)) sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index d8445ac1b2e4..046f13b1d77a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -154,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 1; - if (static_branch_unlikely(&udpv6_encap_needed_key)) sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); -- cgit v1.2.3 From 21f7484220ace6c355cb0023d14d83da6fe5843d Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:05 +0200 Subject: net: gro: only merge packets with incrementing or fixed outer ids Only merge encapsulated packets if their outer IDs are either incrementing or fixed, just like for inner IDs and IDs of non-encapsulated packets. Add another ip_fixedid bit for a total of two bits: one for outer IDs (and for unencapsulated packets) and one for inner IDs. This commit preserves the current behavior of GSO where only the IDs of the inner-most headers are restored correctly. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-3-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 26 +++++++++++--------------- net/ipv4/tcp_offload.c | 5 ++++- 2 files changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/gro.h b/include/net/gro.h index 87c68007f949..e7997a9fb30b 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -75,7 +75,7 @@ struct napi_gro_cb { u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ - u8 ip_fixedid:1; + u8 ip_fixedid:2; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -442,29 +442,26 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, - struct sk_buff *p, bool outer) + struct sk_buff *p, bool inner) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; const u32 df = id & IP_DF; - int flush; /* All fields must match except length and checksum. */ - flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); - - if (flush | (outer && df)) - return flush; + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF))) + return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. 
*/ if (count == 1 && df && !ipid_offset) - NAPI_GRO_CB(p)->ip_fixedid = true; + NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; - return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); + return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) @@ -479,7 +476,7 @@ static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, - bool outer) + bool inner) { const void *nh = th - diff; const void *nh2 = th2 - diff; @@ -487,19 +484,18 @@ static inline int __gro_receive_network_flush(const void *th, const void *th2, if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else - return inet_gro_flush(nh, nh2, p, outer); + return inet_gro_flush(nh, nh2, p, inner); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { - const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; - flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); - if (encap_mark) - flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false); + if (NAPI_GRO_CB(p)->encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true); return flush; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index e6612bd84d09..1949eede9ec9 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,6 +471,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); + bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -484,8 +485,10 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); + is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); + (is_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From 3271f19bf7b9df665549666d789b9f126b4420c7 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:06 +0200 Subject: net: gso: restore ids of outer ip headers correctly Currently, NETIF_F_TSO_MANGLEID indicates that the inner-most ID can be mangled. Outer IDs can always be mangled. Make GSO preserve outer IDs by default, with NETIF_F_TSO_MANGLEID allowing both inner and outer IDs to be mangled. This commit also modifies a few drivers that use SKB_GSO_TCP_FIXEDID directly.
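The resulting flag semantics can be summarized with a small user-space sketch. The two bit values are copied from this patch's include/linux/skbuff.h hunk; the helper and the example in main() are made up for illustration and are not kernel code:

#include <stdbool.h>
#include <stdio.h>

#define SKB_GSO_TCP_FIXEDID        (1u << 30)  /* outer, or only, IP header */
#define SKB_GSO_TCP_FIXEDID_INNER  (1u << 31)  /* inner IP header of encapsulated packets */

/* Hypothetical helper: must the IP ID of the given header level stay
 * fixed across the segments that GSO generates? */
static bool gso_id_is_fixed(unsigned int gso_type, bool encapsulated,
                            bool inner_header)
{
        if (encapsulated && inner_header)
                return gso_type & SKB_GSO_TCP_FIXEDID_INNER;
        return gso_type & SKB_GSO_TCP_FIXEDID;
}

int main(void)
{
        /* Encapsulated flow whose inner IP ID must stay fixed while
         * the outer ID keeps incrementing: */
        unsigned int gso_type = SKB_GSO_TCP_FIXEDID_INNER;

        printf("outer fixed: %d, inner fixed: %d\n",
               gso_id_is_fixed(gso_type, true, false),
               gso_id_is_fixed(gso_type, true, true));
        return 0;
}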
Signed-off-by: Richard Gobert Reviewed-by: Edward Cree # for sfc Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-4-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/segmentation-offloads.rst | 22 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 ++++++-- drivers/net/ethernet/sfc/ef100_tx.c | 17 +++++++++++++---- include/linux/netdevice.h | 9 +++++++-- include/linux/skbuff.h | 8 +++++++- net/core/dev.c | 10 ++++++++-- net/ipv4/af_inet.c | 13 ++++++------- net/ipv4/tcp_offload.c | 6 ++---- 8 files changed, 63 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst index 085e8fab03fd..72f69b22b28c 100644 --- a/Documentation/networking/segmentation-offloads.rst +++ b/Documentation/networking/segmentation-offloads.rst @@ -43,10 +43,19 @@ also point to the TCP header of the packet. For IPv4 segmentation we support one of two types in terms of the IP ID. The default behavior is to increment the IP ID with every segment. If the GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP -ID and all segments will use the same IP ID. If a device has -NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO -and we will either increment the IP ID for all frames, or leave it at a -static value based on driver preference. +ID and all segments will use the same IP ID. + +For encapsulated packets, SKB_GSO_TCP_FIXEDID refers only to the outer header. +SKB_GSO_TCP_FIXEDID_INNER can be used to specify the same for the inner header. +Any combination of these two GSO types is allowed. + +If a device has NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when +performing TSO and we will either increment the IP ID for all frames, or leave +it at a static value based on driver preference. For encapsulated packets, +NETIF_F_TSO_MANGLEID is relevant for both outer and inner headers, unless the +DF bit is not set on the outer header, in which case the device driver must +guarantee that the IP ID field is incremented in the outer header with every +segment. UDP Fragmentation Offload @@ -124,10 +133,7 @@ Generic Receive Offload Generic receive offload is the complement to GSO. Ideally any frame assembled by GRO should be segmented to create an identical sequence of frames using GSO, and any sequence of frames segmented by GSO should be -able to be reassembled back to the original by GRO. The only exception to -this is IPv4 ID in the case that the DF bit is set for a given IP header. -If the value of the IPv4 ID is not sequentially incrementing it will be -altered so that it is when a frame assembled via GRO is segmented via GSO. +able to be reassembled back to the original by GRO. 
Partial Generic Segmentation Offload diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 4ed43ee9aa35..263d5628ee44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1290,8 +1290,12 @@ static void mlx5e_shampo_update_ipv4_tcp_hdr(struct mlx5e_rq *rq, struct iphdr * tcp->check = ~tcp_v4_check(skb->len - tcp_off, ipv4->saddr, ipv4->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; - if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) { + bool encap = rq->hw_gro_data->fk.control.flags & FLOW_DIS_ENCAPSULATION; + + skb_shinfo(skb)->gso_type |= encap ? SKB_GSO_TCP_FIXEDID_INNER : + SKB_GSO_TCP_FIXEDID; + } skb->csum_start = (unsigned char *)tcp - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index e6b6be549581..03005757c060 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -189,6 +189,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, { bool gso_partial = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; unsigned int len, ip_offset, tcp_offset, payload_segs; + u32 mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; u32 mangleid = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; unsigned int outer_ip_offset, outer_l4_offset; u16 vlan_tci = skb_vlan_tag_get(skb); @@ -200,8 +201,17 @@ static void ef100_make_tso_desc(struct efx_nic *efx, bool outer_csum; u32 paylen; - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) - mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (encap) { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID_INNER) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } else { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } + if (efx->net_dev->features & NETIF_F_HW_VLAN_CTAG_TX) vlan_enable = skb_vlan_tag_present(skb); @@ -245,8 +255,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, ESF_GZ_TX_TSO_OUTER_L4_OFF_W, outer_l4_offset >> 1, ESF_GZ_TX_TSO_ED_OUTER_UDP_LEN, udp_encap && !gso_partial, ESF_GZ_TX_TSO_ED_OUTER_IP_LEN, encap && !gso_partial, - ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, encap ? 
mangleid : - ESE_GZ_TX_DESC_IP4_ID_NO_OP, + ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, mangleid_outer, ESF_GZ_TX_TSO_VLAN_INSERT_EN, vlan_enable, ESF_GZ_TX_TSO_VLAN_INSERT_TCI, vlan_tci ); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1c54d44805fa..1b85454116f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5320,13 +5320,18 @@ void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { - netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; + netdev_features_t feature; + + if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER)) + gso_type |= __SKB_GSO_TCP_FIXEDID; + + feature = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 78ecfa7d00d0..fb3fec9affaa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -674,7 +674,7 @@ enum { /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, - SKB_GSO_TCP_FIXEDID = 1 << 3, + __SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, @@ -707,6 +707,12 @@ enum { SKB_GSO_FRAGLIST = 1 << 18, SKB_GSO_TCP_ACCECN = 1 << 19, + + /* These indirectly map onto the same netdev feature. + * If NETIF_F_TSO_MANGLEID is set it may mangle both inner and outer IDs. + */ + SKB_GSO_TCP_FIXEDID = 1 << 30, + SKB_GSO_TCP_FIXEDID_INNER = 1 << 31, }; #if BITS_PER_LONG > 32 diff --git a/net/core/dev.c b/net/core/dev.c index fc4993526ead..8b54fdf0289a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e298dacb4a06..804c51296c55 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,14 +1395,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + /* fixed ID is invalid if DF bit is not set */ + fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); + if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) + goto out; - /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; - } + if (!skb->encapsulation || encap) + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 1949eede9ec9..2cb93da93abc 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,7 +471,6 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); - bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -485,10 +484,9 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); - is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; - + BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (is_fixedid * SKB_GSO_TCP_FIXEDID); + (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From f095a358faf263bf1d8ae712bd38e13b71286819 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:07 +0200 Subject: net: gro: remove unnecessary df checks Currently, packets with fixed IDs will be merged only if their don't-fragment bit is set. This restriction is unnecessary since packets without the don't-fragment bit will be forwarded as-is even if they were merged together. The merged packets will be segmented into their original forms before being forwarded, either by GSO or by TSO. The IDs will also remain identical unless NETIF_F_TSO_MANGLEID is set, in which case the IDs can become incrementing, which is also fine. Clean up the code by removing the unnecessary don't-fragment checks. Signed-off-by: Richard Gobert Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-5-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- include/net/gro.h | 5 ++--- net/ipv4/af_inet.c | 3 --- tools/testing/selftests/net/gro.c | 9 ++++----- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/gro.h b/include/net/gro.h index e7997a9fb30b..e3affb2e2ca8 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -448,17 +448,16 @@ static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *ip const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; - const u32 df = id & IP_DF; /* All fields must match except length and checksum. 
*/ - if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF))) + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF)) return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ - if (count == 1 && df && !ipid_offset) + if (count == 1 && !ipid_offset) NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 804c51296c55..3109c5ec38f3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,10 +1395,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - /* fixed ID is invalid if DF bit is not set */ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; if (!skb->encapsulation || encap) udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index d5824eadea10..3d4a82a2607c 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -670,7 +670,7 @@ static void send_flush_id_case(int fd, struct sockaddr_ll *daddr, int tcase) iph2->id = htons(9); break; - case 3: /* DF=0, Fixed - should not coalesce */ + case 3: /* DF=0, Fixed - should coalesce */ iph1->frag_off &= ~htons(IP_DF); iph1->id = htons(8); @@ -1188,10 +1188,9 @@ static void gro_receiver(void) correct_payload[0] = PAYLOAD_LEN * 2; check_recv_pkts(rxfd, correct_payload, 1); - printf("DF=0, Fixed - should not coalesce: "); - correct_payload[0] = PAYLOAD_LEN; - correct_payload[1] = PAYLOAD_LEN; - check_recv_pkts(rxfd, correct_payload, 2); + printf("DF=0, Fixed - should coalesce: "); + correct_payload[0] = PAYLOAD_LEN * 2; + check_recv_pkts(rxfd, correct_payload, 1); printf("DF=1, 2 Incrementing and one fixed - should coalesce only first 2 packets: "); correct_payload[0] = PAYLOAD_LEN * 2; -- cgit v1.2.3 From bc8712f2b5250825968e6b0c3d2709a4b9d5d570 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Thu, 25 Sep 2025 10:00:12 -0700 Subject: bpf: Emit struct bpf_xdp_sock type in vmlinux BTF Similar to other BPF UAPI struct, force emit BTF of struct bpf_xdp_sock so that it is defined in vmlinux.h. In a later patch, a selftest will use vmlinux.h to get the definition of struct bpf_xdp_sock instead of bpf.h. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250925170013.1752561-1-ameryhung@gmail.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index b20d59bb19b8..2af0a5f1d748 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7439,6 +7439,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_sock, FIELD)); \ } while (0) + BTF_TYPE_EMIT(struct bpf_xdp_sock); + switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); -- cgit v1.2.3 From c30d084960cf316c95fbf145d39974ce1ff7889c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 25 Sep 2025 18:00:07 +0200 Subject: xsk: avoid overwriting skb fields for multi-buffer traffic We are unnecessarily setting a bunch of skb fields per each processed descriptor, which is redundant for fragmented frames. Let us set these respective members for first fragment only. 
To address both paths that we have within xsk_build_skb(), move assignments onto xsk_set_destructor_arg() and rename it to xsk_skb_init_misc(). Signed-off-by: Maciej Fijalkowski Acked-by: Stanislav Fomichev Reviewed-by: Jason Xing Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250925160009.2474816-2-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 72e34bd2d925..01f258894fae 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -618,11 +618,16 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr) +static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); + skb->dev = xs->dev; + skb->priority = READ_ONCE(xs->sk.sk_priority); + skb->mark = READ_ONCE(xs->sk.sk_mark); XSKCB(skb)->num_descs = 0; + skb->destructor = xsk_destruct_skb; skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; } @@ -673,7 +678,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, skb_reserve(skb, hr); - xsk_set_destructor_arg(skb, desc->addr); + xsk_skb_init_misc(skb, xs, desc->addr); } else { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) @@ -757,7 +762,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (unlikely(err)) goto free_err; - xsk_set_destructor_arg(skb, desc->addr); + xsk_skb_init_misc(skb, xs, desc->addr); } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct xsk_addr_node *xsk_addr; @@ -826,14 +831,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) skb->skb_mstamp_ns = meta->request.launch_time; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); } } - skb->dev = dev; - skb->priority = READ_ONCE(xs->sk.sk_priority); - skb->mark = READ_ONCE(xs->sk.sk_mark); - skb->destructor = xsk_destruct_skb; - xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_inc_num_desc(skb); return skb; -- cgit v1.2.3 From 6b9c129c2f93df545248e26434720928f249ff2e Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 25 Sep 2025 18:00:08 +0200 Subject: xsk: remove @first_frag from xsk_build_skb() Instead of using auxiliary boolean that tracks if we are at first frag when gathering all elements of skb, same functionality can be achieved with checking if skb_shared_info::nr_frags is 0. Remove @first_frag but be careful around xsk_build_skb_zerocopy() and NULL the skb pointer when it failed so that common error path does not incorrectly interpret it during decision whether to call kfree_skb(). 
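A standalone sketch of the error-path contract described above (stand-in types; only the branch logic mirrors the kernel change):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in types: just enough to model the decision. */
struct shared_info { int nr_frags; };
struct sk_buff { struct shared_info shinfo; };

static void free_err_path(struct sk_buff *skb)
{
	/* Mirrors the new check: free only an skb that exists and has no
	 * frags attached yet, i.e. the head allocated in this call. A NULL
	 * skb (failed zerocopy build) or an skb already carrying frags
	 * (kept in xs->skb across calls) must be left alone. */
	if (skb && skb->shinfo.nr_frags == 0) {
		printf("kfree_skb()\n");
		free(skb);
	} else {
		printf("skb kept (or already NULL)\n");
	}
}

int main(void)
{
	struct sk_buff *head = calloc(1, sizeof(*head));
	struct sk_buff *multi = calloc(1, sizeof(*multi));

	free_err_path(NULL);	/* zerocopy build failed: nothing to free */
	free_err_path(head);	/* fresh head skb: freed */
	multi->shinfo.nr_frags = 2;
	free_err_path(multi);	/* later fragment: not freed here */
	free(multi);
	return 0;
}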
Signed-off-by: Maciej Fijalkowski Acked-by: Stanislav Fomichev Reviewed-by: Jason Xing Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250925160009.2474816-3-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 01f258894fae..f7e0d254a723 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -730,13 +730,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; - bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); + skb = NULL; goto free_err; } } else { @@ -747,8 +747,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, len = desc->len; if (!skb) { - first_frag = true; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); @@ -798,7 +796,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } - if (first_frag && desc->options & XDP_TX_METADATA) { + if (!skb_shinfo(skb)->nr_frags && desc->options & XDP_TX_METADATA) { if (unlikely(xs->pool->tx_metadata_len == 0)) { err = -EINVAL; goto free_err; @@ -840,7 +838,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, return skb; free_err: - if (first_frag && skb) + if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); if (err == -EOVERFLOW) { -- cgit v1.2.3 From 30c3055f9c0d84a67b8fd723bdec9b1b52b3c695 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Thu, 25 Sep 2025 18:00:09 +0200 Subject: xsk: wrap generic metadata handling onto separate function xsk_build_skb() has gone wild with its size and one of the things we can do about it is to pull out a branch that takes care of metadata handling and make it a separate function. While at it, let us add metadata SW support for devices supporting IFF_TX_SKB_NO_LINEAR flag, that happen to have separate logic for building skb in xsk's generic xmit path. 
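A minimal userspace model of one check that moves into the new helper, the checksum-region bounds test (field names mimic the kernel's TX metadata request; the types and scaffolding are stand-ins):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct meta_req { uint32_t csum_start, csum_offset; };

static int check_csum_request(const struct meta_req *req, uint32_t desc_len)
{
	/* The requested checksum start, offset and the 16-bit checksum
	 * itself (sizeof(__sum16) == 2) must all fit in the descriptor. */
	if ((uint64_t)req->csum_start + req->csum_offset + 2 > desc_len)
		return -EINVAL;
	return 0;
}

int main(void)
{
	struct meta_req ok = { .csum_start = 14, .csum_offset = 16 };
	struct meta_req bad = { .csum_start = 90, .csum_offset = 16 };

	printf("ok:  %d\n", check_csum_request(&ok, 128));	/* 0 */
	printf("bad: %d\n", check_csum_request(&bad, 100));	/* -EINVAL */
	return 0;
}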
Acked-by: Stanislav Fomichev Reviewed-by: Jason Xing Signed-off-by: Maciej Fijalkowski Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250925160009.2474816-4-maciej.fijalkowski@intel.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 92 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index f7e0d254a723..7b0c68a70888 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -657,6 +657,45 @@ static void xsk_drop_skb(struct sk_buff *skb) xsk_consume_skb(skb); } +static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, + struct xdp_desc *desc, struct xsk_buff_pool *pool, + u32 hr) +{ + struct xsk_tx_metadata *meta = NULL; + + if (unlikely(pool->tx_metadata_len == 0)) + return -EINVAL; + + meta = buffer - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return -EINVAL; + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > desc->len)) + return -EINVAL; + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(pool->tx_sw_csum)) { + int err; + + err = skb_checksum_help(skb); + if (err) + return err; + } + } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); + + return 0; +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { @@ -669,6 +708,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, int err, i; u64 addr; + addr = desc->addr; + buffer = xsk_buff_raw_get_data(pool, addr); + if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); @@ -679,6 +721,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, skb_reserve(skb, hr); xsk_skb_init_misc(skb, xs, desc->addr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, pool, hr); + if (unlikely(err)) + return ERR_PTR(err); + } } else { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) @@ -692,11 +739,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } - addr = desc->addr; len = desc->len; ts = pool->unaligned ? 
len : pool->chunk_size; - buffer = xsk_buff_raw_get_data(pool, addr); offset = offset_in_page(buffer); addr = buffer - pool->addrs; @@ -727,7 +772,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { - struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; int err; @@ -761,6 +805,12 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; xsk_skb_init_misc(skb, xs, desc->addr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, + xs->pool, hr); + if (unlikely(err)) + goto free_err; + } } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct xsk_addr_node *xsk_addr; @@ -795,42 +845,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, xsk_addr->addr = desc->addr; list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } - - if (!skb_shinfo(skb)->nr_frags && desc->options & XDP_TX_METADATA) { - if (unlikely(xs->pool->tx_metadata_len == 0)) { - err = -EINVAL; - goto free_err; - } - - meta = buffer - xs->pool->tx_metadata_len; - if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { - err = -EINVAL; - goto free_err; - } - - if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { - if (unlikely(meta->request.csum_start + - meta->request.csum_offset + - sizeof(__sum16) > len)) { - err = -EINVAL; - goto free_err; - } - - skb->csum_start = hr + meta->request.csum_start; - skb->csum_offset = meta->request.csum_offset; - skb->ip_summed = CHECKSUM_PARTIAL; - - if (unlikely(xs->pool->tx_sw_csum)) { - err = skb_checksum_help(skb); - if (err) - goto free_err; - } - } - - if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) - skb->skb_mstamp_ns = meta->request.launch_time; - xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - } } xsk_inc_num_desc(skb); -- cgit v1.2.3 From 6c85fb5486c5a8ae646438877d7dc5050992a173 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 25 Sep 2025 09:09:50 +0200 Subject: psp: Expand PSP acronym in INET_PSP help description People not very intimate with PSP may not know the meaning of this recursive acronym. Hence replace the half-explanatory "PSP protocol" in the help description by the full expansion, like is done in the linked PSP Architecture Specification document. Signed-off-by: Geert Uytterhoeven Reviewed-by: Simon Horman Link: https://patch.msgid.link/ae13c3ed7f80e604b8ae1561437a67b73549e599.1758784164.git.geert+renesas@glider.be Signed-off-by: Jakub Kicinski --- net/psp/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/psp/Kconfig b/net/psp/Kconfig index a7d24691a7e1..371e8771f3bd 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -8,7 +8,7 @@ config INET_PSP select SKB_DECRYPTED select SOCK_VALIDATE_XMIT help - Enable kernel support for the PSP protocol. + Enable kernel support for the PSP Security Protocol (PSP). For more information see: https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf -- cgit v1.2.3 From cc2f08129925b437bf28f7f7822f20dac083a87c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 24 Sep 2025 12:40:33 +0000 Subject: ethtool: add FEC bins histogram report IEEE 802.3ck-2022 defines counters for FEC bins and 802.3df-2024 clarifies it a bit further. Implement reporting interface through as addition to FEC stats available in ethtool. Drivers can leave bin counter uninitialized if per-lane values are provided. In this case the core will recalculate summ for the bin. 
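A standalone model of that recalculation, mirroring the per-lane fallback in the net/ethtool/fec.c hunk below (the two constants stand in for ETHTOOL_STAT_NOT_SET and ETHTOOL_MAX_LANES):

#include <stdint.h>
#include <stdio.h>

#define STAT_NOT_SET	(~0ULL)	/* stand-in for ETHTOOL_STAT_NOT_SET */
#define MAX_LANES	8	/* stand-in for ETHTOOL_MAX_LANES */

struct hist_value {
	uint64_t sum;
	uint64_t per_lane[MAX_LANES];
};

/* When the driver leaves the bin's sum unset but fills per-lane
 * counters, the core adds up the lanes that are set and reports that
 * total instead; a driver-provided sum always takes precedence. */
static uint64_t bin_total(const struct hist_value *v)
{
	uint64_t sum = 0;
	int j;

	if (v->sum != STAT_NOT_SET)
		return v->sum;
	for (j = 0; j < MAX_LANES; j++) {
		if (v->per_lane[j] == STAT_NOT_SET)
			break;
		sum += v->per_lane[j];
	}
	return sum;
}

int main(void)
{
	struct hist_value v = { .sum = STAT_NOT_SET };
	int j;

	for (j = 0; j < MAX_LANES; j++)
		v.per_lane[j] = STAT_NOT_SET;
	v.per_lane[0] = 125;
	v.per_lane[1] = 120;

	printf("bin total = %llu\n", (unsigned long long)bin_total(&v)); /* 245 */
	return 0;
}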
Signed-off-by: Vadim Fedorenko Reviewed-by: Aleksandr Loktionov Link: https://patch.msgid.link/20250924124037.1508846-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 29 +++++++++ Documentation/networking/ethtool-netlink.rst | 5 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +- .../net/ethernet/fungible/funeth/funeth_ethtool.c | 3 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 +- .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +- drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 3 +- drivers/net/ethernet/sfc/ethtool.c | 3 +- drivers/net/ethernet/sfc/siena/ethtool.c | 3 +- drivers/net/netdevsim/ethtool.c | 25 +++++++- include/linux/ethtool.h | 25 +++++++- include/uapi/linux/ethtool_netlink_generated.h | 12 ++++ net/ethtool/fec.c | 75 +++++++++++++++++++++- 15 files changed, 186 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 7a7594713f1f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1219,6 +1219,30 @@ attribute-sets: name: udp-ports type: nest nested-attributes: tunnel-udp + - + name: fec-hist + attr-cnt-name: --ethtool-a-fec-hist-cnt + attributes: + - + name: pad + type: pad + - + name: bin-low + type: u32 + doc: Low bound of FEC bin (inclusive) + - + name: bin-high + type: u32 + doc: High bound of FEC bin (inclusive) + - + name: bin-val + type: uint + doc: Error count in the bin (optional if per-lane values exist) + - + name: bin-val-per-lane + type: binary + sub-type: u64 + doc: An array of per-lane error counters in the bin (optional) - name: fec-stat attr-cnt-name: __ethtool-a-fec-stat-cnt @@ -1242,6 +1266,11 @@ attribute-sets: name: corr-bits type: binary sub-type: u64 + - + name: hist + type: nest + multi-attr: True + nested-attributes: fec-hist - name: fec attr-cnt-name: __ethtool-a-fec-cnt diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ab20c644af24..b270886c5f5d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1541,6 +1541,11 @@ Drivers fill in the statistics in the following structure: .. kernel-doc:: include/linux/ethtool.h :identifiers: ethtool_fec_stats +Statistics may have FEC bins histogram attribute ``ETHTOOL_A_FEC_STAT_HIST`` +as defined in IEEE 802.3ck-2022 and 802.3df-2024. Nested attributes will have +the range of FEC errors in the bin (inclusive) and the amount of error events +in the bin. 
+ FEC_SET ======= diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index be32ef8f5c96..41686a6f84b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3208,7 +3208,8 @@ static int bnxt_get_fecparam(struct net_device *dev, } static void bnxt_get_fec_stats(struct net_device *dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct bnxt *bp = netdev_priv(dev); u64 *rx; diff --git a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c index ba83dbf4ed22..1966dba512f8 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c @@ -930,7 +930,8 @@ static void fun_get_rmon_stats(struct net_device *netdev, } static void fun_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *stats) + struct ethtool_fec_stats *stats, + struct ethtool_fec_hist *hist) { const struct funeth_priv *fp = netdev_priv(netdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index a752d0e3db3a..a5eefa28454c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1659,7 +1659,8 @@ static void hns3_set_msglevel(struct net_device *netdev, u32 msg_level) } static void hns3_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct hnae3_handle *handle = hns3_get_handle(netdev); struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(handle); diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 348acd46a0ef..dc131779d426 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4624,10 +4624,12 @@ static int ice_get_port_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, * ice_get_fec_stats - returns FEC correctable, uncorrectable stats per netdev * @netdev: network interface device structure * @fec_stats: buffer to hold FEC statistics for given port + * @hist: buffer to put FEC histogram statistics for given port * */ static void ice_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_topology port_topology; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 998c734ff839..b90e23dc49de 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -1283,7 +1283,8 @@ end: } static void otx2_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct otx2_nic *pfvf = netdev_priv(netdev); struct cgx_fw_data *rsp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d507366d773e..bcc3bbb78cc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1927,7 +1927,8 @@ static int 
mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) } static void mlx5e_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index fecb8c602024..d55d2ac1c3b9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -1718,7 +1718,8 @@ fbnic_get_pause_stats(struct net_device *netdev, static void fbnic_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct fbnic_net *fbn = netdev_priv(netdev); struct fbnic_phy_stats *phy_stats; diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 23c6a7df78d0..18fe5850a978 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = efx_netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c index 994909789bfe..8c3ebd0617fb 100644 --- a/drivers/net/ethernet/sfc/siena/ethtool.c +++ b/drivers/net/ethernet/sfc/siena/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index f631d90c428a..36a201533aae 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -165,11 +165,34 @@ nsim_set_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) return 0; } +static const struct ethtool_fec_hist_range netdevsim_fec_ranges[] = { + { 0, 0}, + { 1, 3}, + { 4, 7}, + { 0, 0} +}; + static void -nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats) +nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { + struct ethtool_fec_hist_value *values = hist->values; + + hist->ranges = netdevsim_fec_ranges; + fec_stats->corrected_blocks.total = 123; fec_stats->uncorrectable_blocks.total = 4; + + values[0].per_lane[0] = 125; + values[0].per_lane[1] = 120; + values[0].per_lane[2] = 100; + values[0].per_lane[3] = 100; + values[1].sum = 12; + values[2].sum = 2; + values[2].per_lane[0] = 2; + values[2].per_lane[1] = 0; + values[2].per_lane[2] = 0; + values[2].per_lane[3] = 0; } static int nsim_get_ts_info(struct net_device *dev, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c869b7f8bce8..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,29 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 +/** + * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for + * the end-of-list marker, total 17 items + */ +#define ETHTOOL_FEC_HIST_MAX 17 +/** + * struct ethtool_fec_hist_range - error bits range for FEC 
histogram + * statistics + * @low: low bound of the bin (inclusive) + * @high: high bound of the bin (inclusive) + */ +struct ethtool_fec_hist_range { + u16 low; + u16 high; +}; +struct ethtool_fec_hist { + struct ethtool_fec_hist_value { + u64 sum; + u64 per_lane[ETHTOOL_MAX_LANES]; + } values[ETHTOOL_FEC_HIST_MAX]; + const struct ethtool_fec_hist_range *ranges; +}; /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC @@ -1214,7 +1236,8 @@ struct ethtool_ops { int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, - struct ethtool_fec_stats *fec_stats); + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index e3b8813465d7..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -561,12 +561,24 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +enum { + ETHTOOL_A_FEC_HIST_PAD = 1, + ETHTOOL_A_FEC_HIST_BIN_LOW, + ETHTOOL_A_FEC_HIST_BIN_HIGH, + ETHTOOL_A_FEC_HIST_BIN_VAL, + ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + + __ETHTOOL_A_FEC_HIST_CNT, + ETHTOOL_A_FEC_HIST_MAX = (__ETHTOOL_A_FEC_HIST_CNT - 1) +}; + enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, ETHTOOL_A_FEC_STAT_CORRECTED, ETHTOOL_A_FEC_STAT_UNCORR, ETHTOOL_A_FEC_STAT_CORR_BITS, + ETHTOOL_A_FEC_STAT_HIST, __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, 
j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? + sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; -- cgit v1.2.3 From 4b1ff850e0c1aacc23e923ed22989b827b9808f9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:36 +0200 Subject: mptcp: pm: in-kernel: usable client side with C-flag When servers set the C-flag in their MP_CAPABLE to tell clients not to create subflows to the initial address and port, clients will likely not use their other endpoints. That's because the in-kernel path-manager uses the 'subflow' endpoints to create subflows only to the initial address and port. If the limits have not been modified to accept ADD_ADDR, the client doesn't try to establish new subflows. If the limits accept ADD_ADDR, the routing routes will be used to select the source IP. The C-flag is typically set when the server is operating behind a legacy Layer 4 load balancer, or using anycast IP address. Clients having their different 'subflow' endpoints setup, don't end up creating multiple subflows as expected, and causing some deployment issues. A special case is then added here: when servers set the C-flag in the MPC and directly sends an ADD_ADDR, this single ADD_ADDR is accepted. The 'subflows' endpoints will then be used with this new remote IP and port. This exception is only allowed when the ADD_ADDR is sent immediately after the 3WHS, and makes the client switching to the 'fully established' mode. After that, 'select_local_address()' will not be able to find any subflows, because 'id_avail_bitmap' will be filled in mptcp_pm_create_subflow_or_signal_addr(), when switching to 'fully established' mode. 
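A standalone model of when this special case kicks in (the real predicate is mptcp_pm_add_addr_c_flag_case() in the protocol.h hunk below; field names here follow the 'extra_subflows' rename made later in this series, and the struct is a stand-in):

#include <stdbool.h>
#include <stdio.h>

struct pm_state {
	bool remote_deny_join_id0;	/* peer set the C-flag */
	unsigned int local_addr_used;
	unsigned int add_addr_accept_max;	/* default limits: 0 */
	unsigned int extra_subflows, subflows_max;
};

static bool add_addr_c_flag_case(const struct pm_state *pm)
{
	return pm->remote_deny_join_id0 &&
	       pm->local_addr_used == 0 &&
	       pm->add_addr_accept_max == 0 &&
	       pm->extra_subflows < pm->subflows_max;
}

int main(void)
{
	struct pm_state pm = {
		.remote_deny_join_id0 = true,	/* server sent the C-flag */
		.local_addr_used = 0,
		.add_addr_accept_max = 0,	/* default: no ADD_ADDR */
		.extra_subflows = 0,
		.subflows_max = 2,
	};

	/* The first ADD_ADDR right after the 3WHS is accepted... */
	printf("accept first ADD_ADDR: %d\n", add_addr_c_flag_case(&pm));

	/* ...but once 'subflow' endpoints have been used, it no longer is. */
	pm.local_addr_used = 1;
	printf("accept later ADD_ADDR: %d\n", add_addr_c_flag_case(&pm));
	return 0;
}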
Fixes: df377be38725 ("mptcp: add deny_join_id0 in mptcp_options_received") Cc: stable@vger.kernel.org Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/536 Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-1-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 7 +++++-- net/mptcp/pm_kernel.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- net/mptcp/protocol.h | 8 ++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 204e1f61212e..584cab90aa6e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -637,9 +637,12 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } - /* id0 should not have a different address */ + /* - id0 should not have a different address + * - special case for C-flag: linked to fill_local_addresses_vec() + */ } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || - (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { + (addr->id > 0 && !READ_ONCE(pm->accept_addr) && + !mptcp_pm_add_addr_c_flag_case(msk))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 667803d72b64..8c46493a0835 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -389,10 +389,12 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info mpc_addr; struct pm_nl_pernet *pernet; unsigned int subflows_max; + bool c_flag_case; int i = 0; pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); + c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk); mptcp_local_address((struct sock_common *)msk, &mpc_addr); @@ -405,12 +407,27 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, continue; if (msk->pm.subflows < subflows_max) { + bool is_id0; + locals[i].addr = entry->addr; locals[i].flags = entry->flags; locals[i].ifindex = entry->ifindex; + is_id0 = mptcp_addresses_equal(&locals[i].addr, + &mpc_addr, + locals[i].addr.port); + + if (c_flag_case && + (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) { + __clear_bit(locals[i].addr.id, + msk->pm.id_avail_bitmap); + + if (!is_id0) + msk->pm.local_addr_used++; + } + /* Special case for ID0: set the correct ID */ - if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) + if (is_id0) locals[i].addr.id = 0; msk->pm.subflows++; @@ -419,6 +436,37 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, } rcu_read_unlock(); + /* Special case: peer sets the C flag, accept one ADD_ADDR if default + * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints + */ + if (!i && c_flag_case) { + unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + + while (msk->pm.local_addr_used < local_addr_max && + msk->pm.subflows < subflows_max) { + struct mptcp_pm_local *local = &locals[i]; + + if (!select_local_address(pernet, msk, local)) + break; + + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (!mptcp_pm_addr_families_match(sk, &local->addr, + remote)) + continue; + + if (mptcp_addresses_equal(&local->addr, &mpc_addr, + local->addr.port)) + continue; + + msk->pm.local_addr_used++; + msk->pm.subflows++; + i++; + } + + return i; + } + /* If the array 
is empty, fill in the single * 'IPADDRANY' local address */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a1787a1344ac..cbe54331e5c7 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1199,6 +1199,14 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.remote_deny_join_id0) && + msk->pm.local_addr_used == 0 && + mptcp_pm_get_add_addr_accept_max(msk) == 0 && + msk->pm.subflows < mptcp_pm_get_subflows_max(msk); +} + void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) -- cgit v1.2.3 From 8dc63ade451d28211511b657fecf5e0822580986 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:38 +0200 Subject: mptcp: pm: in-kernel: refactor fill_local_addresses_vec Before this modification, this function was quite long with many levels of indentations. Each case can be split in a dedicated function: fullmesh, C flag, any. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-3-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 175 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 71 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 8c46493a0835..c8f2af2277c2 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -377,116 +377,149 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) mptcp_pm_create_subflow_or_signal_addr(msk); } -/* Fill all the local addresses into the array addrs[], - * and return the array size. 
- */ -static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *remote, - struct mptcp_pm_local *locals) +static unsigned int +fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals, + bool c_flag_case) { + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; struct mptcp_addr_info mpc_addr; - struct pm_nl_pernet *pernet; - unsigned int subflows_max; - bool c_flag_case; + struct mptcp_pm_local *local; int i = 0; - pernet = pm_nl_get_pernet_from_msk(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk); - mptcp_local_address((struct sock_common *)msk, &mpc_addr); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + bool is_id0; + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) continue; - if (msk->pm.subflows < subflows_max) { - bool is_id0; + local = &locals[i]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; - locals[i].addr = entry->addr; - locals[i].flags = entry->flags; - locals[i].ifindex = entry->ifindex; + is_id0 = mptcp_addresses_equal(&local->addr, &mpc_addr, + local->addr.port); - is_id0 = mptcp_addresses_equal(&locals[i].addr, - &mpc_addr, - locals[i].addr.port); + if (c_flag_case && + (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - if (c_flag_case && - (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) { - __clear_bit(locals[i].addr.id, - msk->pm.id_avail_bitmap); + if (!is_id0) + msk->pm.local_addr_used++; + } - if (!is_id0) - msk->pm.local_addr_used++; - } + /* Special case for ID0: set the correct ID */ + if (is_id0) + local->addr.id = 0; - /* Special case for ID0: set the correct ID */ - if (is_id0) - locals[i].addr.id = 0; + msk->pm.subflows++; + i++; - msk->pm.subflows++; - i++; - } + if (msk->pm.subflows >= subflows_max) + break; } rcu_read_unlock(); - /* Special case: peer sets the C flag, accept one ADD_ADDR if default - * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints - */ - if (!i && c_flag_case) { - unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + return i; +} - while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { - struct mptcp_pm_local *local = &locals[i]; +static unsigned int +fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info mpc_addr; + struct mptcp_pm_local *local; + int i = 0; - if (!select_local_address(pernet, msk, local)) - break; + mptcp_local_address((struct sock_common *)msk, &mpc_addr); - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + while (msk->pm.local_addr_used < local_addr_max) { + local = &locals[i]; - if (!mptcp_pm_addr_families_match(sk, &local->addr, - remote)) - continue; + if (!select_local_address(pernet, msk, local)) + break; - if (mptcp_addresses_equal(&local->addr, &mpc_addr, - local->addr.port)) - continue; + 
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - msk->pm.local_addr_used++; - msk->pm.subflows++; - i++; - } + if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) + continue; - return i; + if (mptcp_addresses_equal(&local->addr, &mpc_addr, + local->addr.port)) + continue; + + msk->pm.local_addr_used++; + msk->pm.subflows++; + i++; + + if (msk->pm.subflows >= subflows_max) + break; } - /* If the array is empty, fill in the single - * 'IPADDRANY' local address - */ - if (!i) { - memset(&locals[i], 0, sizeof(locals[i])); - locals[i].addr.family = + return i; +} + +static unsigned int +fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *local) +{ + struct sock *sk = (struct sock *)msk; + + memset(local, 0, sizeof(*local)); + local->addr.family = #if IS_ENABLED(CONFIG_MPTCP_IPV6) - remote->family == AF_INET6 && - ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : #endif - remote->family; + remote->family; - if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) - return 0; + if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) + return 0; - msk->pm.subflows++; - i++; - } + msk->pm.subflows++; - return i; + return 1; +} + +/* Fill all the local addresses into the array addrs[], + * and return the array size. + */ +static unsigned int +fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk); + int i; + + /* If there is at least one MPTCP endpoint with a fullmesh flag */ + i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case); + if (i) + return i; + + /* Special case: peer sets the C flag, accept one ADD_ADDR if default + * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints + */ + if (c_flag_case) + return fill_local_addresses_vec_c_flag(msk, remote, locals); + + /* No special case: fill in the single 'IPADDRANY' local address */ + return fill_local_address_any(msk, remote, &locals[0]); } static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) -- cgit v1.2.3 From a845b2bbf26ed73e545a3573df264c3a1cb302a1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:39 +0200 Subject: mptcp: pm: in-kernel: refactor fill_remote_addresses_vec Before this modification, this function was quite long with many levels of indentations. Each case can be split in a dedicated function: fullmesh, non-fullmesh. To remove one level of indentation, msk->pm.subflows >= subflows_max is now checked after having added one subflow, and stops the loop if it is no longer possible to add new subflows. This is fine to do this because this function should only be called if msk->pm.subflows < subflows_max. No functional changes intended. 
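The restructuring relies on a check-after-increment loop; a minimal generic sketch of that pattern, valid only under the stated precondition (names are illustrative, not the kernel code):

#include <stdio.h>

/* Precondition: count < max on entry, as the message above notes.
 * Checking the limit after adding one entry removes a level of
 * indentation without changing which entries get added. */
static int fill(unsigned int count, unsigned int max, int candidates)
{
	int added = 0;
	int i;

	for (i = 0; i < candidates; i++) {
		count++;
		added++;
		if (count >= max)	/* check after the increment */
			break;
	}
	return added;
}

int main(void)
{
	/* 1 already used, max 3: only 2 of the 5 candidates are added. */
	printf("added %d\n", fill(1, 3, 5));
	return 0;
}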
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-4-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 116 +++++++++++++++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index c8f2af2277c2..a82c077b8a20 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -159,74 +159,92 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, return found; } -/* Fill all the remote addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *local, - bool fullmesh, - struct mptcp_addr_info *addrs) +static unsigned int +fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, + struct mptcp_addr_info *addrs) +{ + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + struct mptcp_addr_info remote = { 0 }; + struct sock *sk = (struct sock *)msk; + + if (deny_id0) + return 0; + + mptcp_remote_address((struct sock_common *)sk, &remote); + + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; + + msk->pm.subflows++; + *addrs = remote; + + return 1; +} + +static unsigned int +fill_remote_addresses_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; - struct mptcp_addr_info remote = { 0 }; unsigned int subflows_max; int i = 0; subflows_max = mptcp_pm_get_subflows_max(msk); - mptcp_remote_address((struct sock_common *)sk, &remote); - /* Non-fullmesh endpoint, fill in the single entry - * corresponding to the primary MPC subflow remote address + /* Forbid creation of new subflows matching existing ones, possibly + * already created by incoming ADD_ADDR */ - if (!fullmesh) { - if (deny_id0) - return 0; - - if (!mptcp_pm_addr_families_match(sk, local, &remote)) - return 0; - - msk->pm.subflows++; - addrs[i++] = remote; - } else { - DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = READ_ONCE(subflow->remote_id); + if (deny_id0 && !addrs[i].id) + continue; - /* Forbid creation of new subflows matching existing - * ones, possibly already created by incoming ADD_ADDR - */ - bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - mptcp_for_each_subflow(msk, subflow) - if (READ_ONCE(subflow->local_id) == local->id) - __set_bit(subflow->remote_id, unavail_id); - - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = READ_ONCE(subflow->remote_id); - if (deny_id0 && !addrs[i].id) - continue; + if (test_bit(addrs[i].id, unavail_id)) + continue; - if (test_bit(addrs[i].id, unavail_id)) - continue; + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; - if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) - 
continue; + /* forbid creating multiple address towards this id */ + __set_bit(addrs[i].id, unavail_id); + msk->pm.subflows++; + i++; - if (msk->pm.subflows < subflows_max) { - /* forbid creating multiple address towards - * this id - */ - __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; - i++; - } - } + if (msk->pm.subflows >= subflows_max) + break; } return i; } +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int +fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local, + bool fullmesh, struct mptcp_addr_info *addrs) +{ + /* Non-fullmesh: fill in the single entry corresponding to the primary + * MPC subflow remote address, and return 1, corresponding to 1 entry. + */ + if (!fullmesh) + return fill_remote_addr(msk, local, addrs); + + /* Fullmesh endpoint: fill all possible remote addresses */ + return fill_remote_addresses_fullmesh(msk, local, addrs); +} + static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { -- cgit v1.2.3 From c5273f6ca166c4edfaa6a87570e111453a0576ad Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:40 +0200 Subject: mptcp: pm: rename 'subflows' to 'extra_subflows' A few variables linked to the Path-Managers are confusing, and it would help current and future developers, to clarify them. One of them is 'subflows', which in fact represents the number of extra subflows: all the additional subflows created after the initial one, and not the total number of subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
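A tiny compilable sketch of the aliasing trick (the struct is trimmed to the one member of interest; the define matches the mptcp.h hunk below):

#include <stdio.h>

struct mptcp_info {
	unsigned char mptcpi_subflows;
};
/* The uAPI keeps the old member name; the new spelling is an alias. */
#define mptcpi_extra_subflows mptcpi_subflows

int main(void)
{
	struct mptcp_info info = { .mptcpi_subflows = 2 };

	/* Old and new names read the same storage, so existing userspace
	 * keeps building while new code can use the clearer name. */
	printf("%u == %u\n", info.mptcpi_subflows, info.mptcpi_extra_subflows);
	return 0;
}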
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-5-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 13 ++++++------ net/mptcp/pm_kernel.c | 24 +++++++++++------------ net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- tools/testing/selftests/bpf/progs/mptcp_subflow.c | 2 +- 7 files changed, 27 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 15eef878690b..f807c8dba56e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -42,6 +42,7 @@ struct mptcp_info { __u8 mptcpi_subflows; + #define mptcpi_extra_subflows mptcpi_subflows __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 584cab90aa6e..332e96bdadc0 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -489,7 +489,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); - pm->subflows++; + pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } @@ -498,8 +498,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, - subflows_max, READ_ONCE(pm->accept_subflow)); + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, + pm->extra_subflows, subflows_max, + READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -507,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < subflows_max; - if (ret && ++pm->subflows == subflows_max) + ret = pm->extra_subflows < subflows_max; + if (ret && ++pm->extra_subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -594,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); - pm->subflows--; + pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index a82c077b8a20..20bee6fc0625 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -175,7 +175,7 @@ fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; *addrs = remote; return 1; @@ -218,10 +218,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, /* forbid creating multiple address towards this id */ __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); + msk->pm.extra_subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock 
*msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.extra_subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -441,10 +441,10 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, if (is_id0) local->addr.id = 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } rcu_read_unlock(); @@ -483,10 +483,10 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, continue; msk->pm.local_addr_used++; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -510,7 +510,7 @@ fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; return 1; } @@ -586,7 +586,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) + msk->pm.extra_subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1427,7 +1427,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a715dcbe0146..8cbc1920afb4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); else - msk->pm.subflows++; + msk->pm.extra_subflows++; spin_unlock_bh(&msk->pm.lock); create_err: diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index cbe54331e5c7..ca68f9a75801 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -235,7 +235,7 @@ struct mptcp_pm_data { u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; - u8 subflows; + u8 extra_subflows; u8 status; ); @@ -1188,7 +1188,7 @@ unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2abe6f1e9940..17966da80239 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) memset(info, 0, sizeof(*info)); 
- info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); @@ -996,7 +996,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; - info->mptcpi_subflows_total = info->mptcpi_subflows + + info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c index 70302477e326..41389e579578 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c @@ -117,7 +117,7 @@ int _getsockopt_subflow(struct bpf_sockopt *ctx) return 1; msk = bpf_core_cast(sk, struct mptcp_sock); - if (msk->pm.subflows != 1) { + if (msk->pm.extra_subflows != 1) { ctx->retval = -1; return 1; } -- cgit v1.2.3 From 3eb3c9a9596a53880f7d7eff28ac5622f3e0ba37 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:41 +0200 Subject: mptcp: pm: in-kernel: rename 'subflows_max' to 'limit_extra_subflows' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'subflows_max', which in fact represents the limit of extra subflows: the limit set via 'ip mptcp limit subflows X' for example. It is not linked to the maximum number of created / possible subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
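To make the 'limit vs. count' distinction concrete, here is a small standalone model (an illustrative sketch only, not kernel code) of the admission rule: the limit caps only the additional subflows, never the initial MPC one.

/* Toy model of the admission rule; the names mirror the renamed fields
 * but nothing here is kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct pm_model {
	unsigned int extra_subflows;       /* subflows beyond the initial MPC one */
	unsigned int limit_extra_subflows; /* e.g. 'ip mptcp limit subflows 2' */
};

static bool pm_allow_new_subflow(struct pm_model *pm)
{
	/* only the extra subflows are counted against the limit */
	if (pm->extra_subflows >= pm->limit_extra_subflows)
		return false;
	pm->extra_subflows++;
	return true;
}

int main(void)
{
	struct pm_model pm = { .limit_extra_subflows = 2 };

	for (int i = 1; i <= 3; i++)
		printf("extra subflow %d: %s\n", i,
		       pm_allow_new_subflow(&pm) ? "allowed" : "denied");
	/* prints: allowed, allowed, denied; the initial subflow is never counted */
	return 0;
}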
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-6-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 12 ++++++------ net/mptcp/pm_kernel.c | 48 ++++++++++++++++++++++++---------------------- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 37 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f807c8dba56e..314200c61f15 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -46,6 +46,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; + #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 332e96bdadc0..502f6c235e06 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -483,7 +483,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { @@ -496,10 +496,10 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) return false; } - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, - pm->extra_subflows, subflows_max, + pm->extra_subflows, limit_extra_subflows, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ @@ -508,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->extra_subflows < subflows_max; - if (ret && ++pm->extra_subflows == subflows_max) + ret = pm->extra_subflows < limit_extra_subflows; + if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -1029,7 +1029,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { - bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 20bee6fc0625..db0d254d0e6b 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -23,7 +23,7 @@ struct pm_nl_pernet { unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; - unsigned int subflows_max; + unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -62,13 +62,13 @@ unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->subflows_max); + return READ_ONCE(pernet->limit_extra_subflows); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); unsigned 
int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { @@ -190,10 +190,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int i = 0; - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* Forbid creation of new subflows matching existing ones, possibly * already created by incoming ADD_ADDR @@ -221,7 +221,7 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -274,18 +274,18 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_signal_max; bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; - unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.extra_subflows, subflows_max); + msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.extra_subflows < subflows_max) { + msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -402,14 +402,15 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, bool c_flag_case) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { @@ -444,7 +445,7 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } rcu_read_unlock(); @@ -459,13 +460,14 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, { unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock 
*sk = (struct sock *)msk; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); while (msk->pm.local_addr_used < local_addr_max) { local = &locals[i]; @@ -486,7 +488,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -544,14 +546,14 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - unsigned int subflows_max; bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, @@ -586,7 +588,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.extra_subflows >= subflows_max) + msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1285,13 +1287,13 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - subflows = pernet->subflows_max; + subflows = pernet->limit_extra_subflows; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); + WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: spin_unlock_bh(&pernet->lock); @@ -1318,7 +1320,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) + READ_ONCE(pernet->limit_extra_subflows))) goto fail; genlmsg_end(msg, reply); @@ -1427,7 +1429,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); @@ -1462,7 +1464,7 @@ static int __net_init pm_nl_init_net(struct net *net) INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. 
*/ - pernet->subflows_max = 2; + pernet->limit_extra_subflows = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ca68f9a75801..4c777f87b049 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,13 +1182,13 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 17966da80239..4e82bcfcd34e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -972,8 +972,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { - info->mptcpi_subflows_max = - mptcp_pm_get_subflows_max(msk); + info->mptcpi_limit_extra_subflows = + mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_accepted_max = -- cgit v1.2.3 From 45cae570664d58c562e21a3c7409fc02147bba46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:42 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_signal_max' to 'endp_signal_max' A few variables linked to the in-kernel Path-Manager are confusing, and clarifying them would help current and future developers. One of them is 'add_addr_signal_max', which in fact represents the maximum number of 'signal' endpoints that can be used to announce addresses, and not the number of ADD_ADDR that can be signalled. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_signal_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended.
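To show the semantic that the new name captures, here is a hypothetical standalone model (not kernel code): endp_signal_max counts configured 'signal' endpoints, while add_addr_signaled counts ADD_ADDR announcements actually sent on one connection.

/* Toy model: endp_signal_max is bumped when a 'signal' endpoint is
 * configured, while add_addr_signaled tracks announcements per msk.
 */
#include <stdio.h>

struct pernet_model {
	unsigned int endp_signal_max;   /* configured 'signal' endpoints */
};

struct msk_model {
	unsigned int add_addr_signaled; /* ADD_ADDRs sent on this connection */
};

static void endpoint_add_signal(struct pernet_model *p)
{
	p->endp_signal_max++;           /* 'ip mptcp endpoint add <addr> signal' */
}

static void pm_announce_all(const struct pernet_model *p, struct msk_model *m)
{
	/* announce at most one address per configured 'signal' endpoint */
	while (m->add_addr_signaled < p->endp_signal_max)
		m->add_addr_signaled++; /* stands in for mptcp_pm_announce_addr() */
}

int main(void)
{
	struct pernet_model pernet = { 0 };
	struct msk_model msk = { 0 };

	endpoint_add_signal(&pernet);
	endpoint_add_signal(&pernet);
	pm_announce_all(&pernet, &msk);
	printf("signal endpoints: %u, announced: %u\n",
	       pernet.endp_signal_max, msk.add_addr_signaled);
	return 0;
}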
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-7-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 26 +++++++++++++------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 314200c61f15..69fc20db1c2f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -48,6 +48,7 @@ struct mptcp_info { __u8 mptcpi_subflows_max; #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; + #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; __u32 mptcpi_token; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 502f6c235e06..1100ba8b1ce8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1037,7 +1037,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows_allowed) || - !!mptcp_pm_get_add_addr_signal_max(msk)); + !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index db0d254d0e6b..740f0b20b941 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -20,7 +20,7 @@ struct pm_nl_pernet { struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; + unsigned int endp_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int limit_extra_subflows; @@ -46,13 +46,13 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_signal_max); + return READ_ONCE(pernet->endp_signal_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { @@ -275,15 +275,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_signal_max; bool signal_and_subflow = false; + unsigned int endp_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + endp_signal_max = mptcp_pm_get_endp_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); @@ -312,11 +312,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { + if (msk->pm.add_addr_signaled < endp_signal_max) { /* due to 
racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -699,8 +699,8 @@ find_next: pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1098,8 +1098,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1185,7 +1185,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { - WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4c777f87b049..86c30cd6c1f2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1180,7 +1180,7 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4e82bcfcd34e..4688e0f25d15 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -974,8 +974,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (mptcp_pm_is_kernel(msk)) { info->mptcpi_limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - info->mptcpi_add_addr_signal_max = - mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_endp_signal_max = + mptcp_pm_get_endp_signal_max(msk); info->mptcpi_add_addr_accepted_max = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_local_addr_max = -- cgit v1.2.3 From 37712d84dfc2e80d4d218ff9be490c86e604aa69 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:43 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_accept_max' to 'limit_add_addr_accepted' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'add_addr_accept_max', which in fact represents the limit of ADD_ADDR that can be accepted: the limit set via 'ip mptcp limit add_addr_accepted X' for example. It is not linked to the maximum number of accepted ADD_ADDR. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_add_addr_accepted. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
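The limit also acts as a hysteresis threshold for the accept_addr gate, as the mptcp_pm_nl_rm_addr() hunk below shows. Here is a toy model of that open/close behaviour (assumptions: single-threaded, no locking, not kernel code):

/* Toy model of the accept_addr gate: it closes when the number of
 * accepted ADD_ADDRs reaches the limit and re-opens when one of the
 * accepted addresses is removed.
 */
#include <stdbool.h>
#include <stdio.h>

struct pm_model {
	unsigned int add_addr_accepted;
	unsigned int limit_add_addr_accepted; /* 'ip mptcp limit add_addr_accepted X' */
	bool accept_addr;
};

static void pm_add_addr_received(struct pm_model *pm)
{
	if (!pm->accept_addr)
		return; /* over the limit: the ADD_ADDR is not accepted */
	if (++pm->add_addr_accepted >= pm->limit_add_addr_accepted)
		pm->accept_addr = false; /* gate closes at the limit */
}

static void pm_rm_addr(struct pm_model *pm)
{
	if (pm->add_addr_accepted &&
	    --pm->add_addr_accepted < pm->limit_add_addr_accepted)
		pm->accept_addr = true;  /* gate re-opens below the limit */
}

int main(void)
{
	struct pm_model pm = { .limit_add_addr_accepted = 1, .accept_addr = true };

	pm_add_addr_received(&pm); /* accepted; gate closes */
	pm_add_addr_received(&pm); /* ignored */
	pm_rm_addr(&pm);           /* gate re-opens */
	printf("accepted=%u accept_addr=%d\n",
	       pm.add_addr_accepted, pm.accept_addr);
	return 0;
}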
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-8-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 27 +++++++++++++++------------ net/mptcp/protocol.h | 4 ++-- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 21 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 69fc20db1c2f..1c275ce96b52 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -50,6 +50,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal_max; #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; + #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 1100ba8b1ce8..e13bfec50ef8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1039,7 +1039,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, - !!mptcp_pm_get_add_addr_accept_max(msk) && + !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 740f0b20b941..92f7419485a8 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,7 +21,7 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; @@ -54,13 +54,13 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_accept_max); + return READ_ONCE(pernet->limit_add_addr_accepted); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { @@ -547,16 +547,16 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; struct mptcp_addr_info remote; bool sf_created = false; int i, nr; - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); remote = msk->pm.remote; @@ -587,7 +587,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || + if (msk->pm.add_addr_accepted >= limit_add_addr_accepted || msk->pm.extra_subflows >= limit_extra_subflows) 
WRITE_ONCE(msk->pm.accept_addr, false); } @@ -596,10 +596,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + unsigned int limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + if (--msk->pm.add_addr_accepted < limit_add_addr_accepted) WRITE_ONCE(msk->pm.accept_addr, true); } } @@ -1282,7 +1285,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) int ret; spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; + rcv_addrs = pernet->limit_add_addr_accepted; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; @@ -1292,7 +1295,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs); WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: @@ -1316,7 +1319,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) + READ_ONCE(pernet->limit_add_addr_accepted))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 86c30cd6c1f2..114995e1352d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,7 +1181,7 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); @@ -1203,7 +1203,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && - mptcp_pm_get_add_addr_accept_max(msk) == 0 && + mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4688e0f25d15..5ab9909dbe79 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -976,8 +976,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - info->mptcpi_add_addr_accepted_max = - mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); } -- cgit v1.2.3 From e7757b6d3a623671705388be24851af7360b54ba Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:44 +0200 Subject: mptcp: pm: in-kernel: rename 'local_addr_max' to 'endp_subflow_max' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. 
One of them is 'local_addr_max', which in fact represents the maximum number of 'subflow' endpoints that can be used to create new subflows, and not the number of local addresses that have been used to create subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_subflow_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. Also move the variable and function next to the other 'endp_X_max' ones. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-9-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 40 ++++++++++++++++++++-------------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 25 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 1c275ce96b52..5ec996977b3f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -58,6 +58,7 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; + #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e13bfec50ef8..2ff1b9499568 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1035,7 +1035,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, - (!!mptcp_pm_get_local_addr_max(msk) && + (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 92f7419485a8..e62e21eb9da1 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,8 +21,8 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; + unsigned int endp_subflow_max; unsigned int limit_add_addr_accepted; - unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -54,6 +54,14 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_subflow_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); + unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -70,14 +78,6 @@ unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { @@ -276,15 +276,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; unsigned int 
limit_extra_subflows; bool signal_and_subflow = false; + unsigned int endp_subflow_max; unsigned int endp_signal_max; - unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); + endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ @@ -311,7 +311,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, + msk->pm.local_addr_used, endp_subflow_max, msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); @@ -352,7 +352,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && + while (msk->pm.local_addr_used < endp_subflow_max && msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; @@ -458,7 +458,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { - unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + unsigned int endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; @@ -469,7 +469,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, mptcp_local_address((struct sock_common *)msk, &mpc_addr); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - while (msk->pm.local_addr_used < local_addr_max) { + while (msk->pm.local_addr_used < endp_subflow_max) { local = &locals[i]; if (!select_local_address(pernet, msk, local)) @@ -706,8 +706,8 @@ find_next: WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } pernet->addrs++; @@ -1105,8 +1105,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } pernet->addrs--; @@ -1189,7 +1189,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); + WRITE_ONCE(pernet->endp_subflow_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 114995e1352d..df8f977039d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,9 +1181,9 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct 
mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 5ab9909dbe79..92a2a2742627 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -978,8 +978,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_endp_signal_max(msk); info->mptcpi_limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); - info->mptcpi_local_addr_max = - mptcp_pm_get_local_addr_max(msk); + info->mptcpi_endp_subflow_max = + mptcp_pm_get_endp_subflow_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From 35e71e43a56d40e68ea0ebab3ac85038624cb8b5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:45 +0200 Subject: mptcp: pm: in-kernel: rename 'local_addr_list' to 'endp_list' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'local_addr_list', which in fact represents the list of endpoints, and not only the 'subflow' endpoints. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-10-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index e62e21eb9da1..056624965546 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -17,7 +17,7 @@ static int pm_nl_pernet_id; struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; - struct list_head local_addr_list; + struct list_head endp_list; unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; @@ -110,7 +110,7 @@ select_local_address(const struct pm_nl_pernet *pernet, msk_owned_by_me(msk); rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -141,7 +141,7 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, * Note: removal from the local address list during the msk life-cycle * can lead to additional addresses not being announced. 
*/ - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; @@ -250,7 +250,7 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + list_for_each_entry_rcu(entry, &pernet->endp_list, list, lockdep_is_held(&pernet->lock)) { if (entry->addr.id == id) return entry; @@ -263,7 +263,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + list_for_each_entry_rcu(entry, &pernet->endp_list, list, lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; @@ -413,7 +413,7 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { bool is_id0; if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) @@ -650,7 +650,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (!address_use_port(entry)) entry->addr.port = 0; - list_for_each_entry(cur, &pernet->local_addr_list, list) { + list_for_each_entry(cur, &pernet->endp_list, list) { if (mptcp_addresses_equal(&cur->addr, &entry->addr, cur->addr.port || entry->addr.port)) { /* allow replacing the exiting endpoint only if such @@ -712,9 +712,9 @@ find_next: pernet->addrs++; if (!entry->addr.port) - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + list_add_tail_rcu(&entry->list, &pernet->endp_list); else - list_add_rcu(&entry->list, &pernet->local_addr_list); + list_add_rcu(&entry->list, &pernet->endp_list); ret = entry->addr.id; out: @@ -1199,7 +1199,7 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->local_addr_list, &free_list); + list_splice_init(&pernet->endp_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -1464,7 +1464,7 @@ static int __net_init pm_nl_init_net(struct net *net) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + INIT_LIST_HEAD_RCU(&pernet->endp_list); /* Cit. 2 subflows ought to be enough for anybody. */ pernet->limit_extra_subflows = 2; @@ -1490,7 +1490,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) * other modifiers, also netns core already waited for a * RCU grace period. */ - __flush_addrs(&pernet->local_addr_list); + __flush_addrs(&pernet->endp_list); } } -- cgit v1.2.3 From e9aa044f4a1f7c7a858b96ea1fc7c642095ef4b8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:46 +0200 Subject: mptcp: pm: in-kernel: rename 'addrs' to 'endpoints' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'addrs', which in fact represents the number of declared endpoints, and not only the 'signal' endpoints. No functional changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-11-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 056624965546..d30b06605f62 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -18,7 +18,7 @@ struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; struct list_head endp_list; - unsigned int addrs; + unsigned int endpoints; unsigned int stale_loss_cnt; unsigned int endp_signal_max; unsigned int endp_subflow_max; @@ -636,7 +636,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) { ret = -ERANGE; goto out; } @@ -675,7 +675,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, goto out; } - pernet->addrs--; + pernet->endpoints--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); del_entry = cur; @@ -710,7 +710,7 @@ find_next: WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } - pernet->addrs++; + pernet->endpoints++; if (!entry->addr.port) list_add_tail_rcu(&entry->list, &pernet->endp_list); else @@ -1109,7 +1109,7 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } - pernet->addrs--; + pernet->endpoints--; list_del_rcu(&entry->list); __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); @@ -1190,7 +1190,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->endp_subflow_max, 0); - pernet->addrs = 0; + pernet->endpoints = 0; } int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3 From db9a0e3858ba8acbe4f78edcb8c2061aee53dfa4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:47 +0200 Subject: mptcp: pm: in-kernel: remove stale_loss_cnt It is currently not used, and in fact was never used since its introduction in commit ff5a0b421cb2 ("mptcp: faster active backup recovery"). It was probably initially added to struct pm_nl_pernet during the development of that commit, before being added to struct mptcp_pernet in ctrl.c, but it was never removed from the original location. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-12-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index d30b06605f62..0e1e99e72950 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -19,7 +19,6 @@ struct pm_nl_pernet { spinlock_t lock; struct list_head endp_list; unsigned int endpoints; - unsigned int stale_loss_cnt; unsigned int endp_signal_max; unsigned int endp_subflow_max; unsigned int limit_add_addr_accepted; @@ -1469,7 +1468,6 @@ static int __net_init pm_nl_init_net(struct net *net) /* Cit. 2 subflows ought to be enough for anybody.
*/ pernet->limit_extra_subflows = 2; pernet->next_id = 1; - pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at -- cgit v1.2.3 From 4984fe6254f8d469c98e639856b7ce21fe8da86f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:48 +0200 Subject: mptcp: pm: in-kernel: reduce pernet struct size All the 'unsigned int' variables from the 'pm_nl_pernet' structure are bounded by MPTCP_PM_ADDR_MAX, currently set to 8. The endpoint ID is also bounded by the protocol to 8 bits. MPTCP_PM_ADDR_MAX, even if extended later, will never exceed 8 bits. So there is no need to use 'unsigned int' variables; 'u8' is enough. Note that the counters exposed in MPTCP_INFO are already limited to 8 bits, as are pm->extra_subflows and others. So it seems even better to limit these to 8 bits as well. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-13-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 59 ++++++++++++++++++++------------------------------- net/mptcp/protocol.h | 8 +++---- 2 files changed, 27 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 0e1e99e72950..117f842fe18e 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -18,12 +18,12 @@ struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; struct list_head endp_list; - unsigned int endpoints; - unsigned int endp_signal_max; - unsigned int endp_subflow_max; - unsigned int limit_add_addr_accepted; - unsigned int limit_extra_subflows; - unsigned int next_id; + u8 endpoints; + u8 endp_signal_max; + u8 endp_subflow_max; + u8 limit_add_addr_accepted; + u8 limit_extra_subflows; + u8 next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -45,7 +45,7 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -53,7 +53,7 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -61,7 +61,7 @@ unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); -unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) +u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -69,7 +69,7 @@ unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); -unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) +u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -185,15 +185,13 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, struct mptcp_addr_info *local, struct mptcp_addr_info *addrs) { + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); bool deny_id0 =
READ_ONCE(msk->pm.remote_deny_join_id0); DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; - unsigned int limit_extra_subflows; int i = 0; - limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - /* Forbid creation of new subflows matching existing ones, possibly * already created by incoming ADD_ADDR */ @@ -272,20 +270,14 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk); struct sock *sk = (struct sock *)msk; - unsigned int limit_extra_subflows; bool signal_and_subflow = false; - unsigned int endp_subflow_max; - unsigned int endp_signal_max; - struct pm_nl_pernet *pernet; struct mptcp_pm_local local; - pernet = pm_nl_get_pernet(sock_net(sk)); - - endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); - limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); @@ -400,16 +392,15 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, struct mptcp_pm_local *locals, bool c_flag_case) { + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; - unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); - limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->endp_list, list) { @@ -457,16 +448,15 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { - unsigned int endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); struct sock *sk = (struct sock *)msk; - unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); - limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); while (msk->pm.local_addr_used < endp_subflow_max) { local = &locals[i]; @@ -543,17 +533,14 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { + u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; - unsigned int limit_extra_subflows; - unsigned int limit_add_addr_accepted; struct mptcp_addr_info remote; bool sf_created = false; int i, nr; - limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); - limit_extra_subflows = 
mptcp_pm_get_limit_extra_subflows(msk); - pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); @@ -595,7 +582,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { - unsigned int limit_add_addr_accepted = + u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); /* Note: if the subflow has been closed before, this @@ -626,8 +613,8 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, bool needs_id, bool replace) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; - unsigned int addr_max; int ret = -EINVAL; + u8 addr_max; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, @@ -1072,8 +1059,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; - unsigned int addr_max; struct nlattr *attr; + u8 addr_max; int ret; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index df8f977039d0..0cd3333cafaf 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1180,10 +1180,10 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); +u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) -- cgit v1.2.3 From f596293314b25fc494acb42f40ec256e4662d04f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:49 +0200 Subject: mptcp: pm: in-kernel: compare IDs instead of addresses When receiving an ADD_ADDR right after the 3WHS, the connection will switch to 'fully established'. It means the MPTCP worker will be called to treat two events, in this order: ADD_ADDR_RECEIVED, PM_ESTABLISHED. The MPTCP endpoints cannot have the ID 0, because it is reserved for the address and port used by the initial subflow. To be able to deal with this case in different places, msk->mpc_endpoint_id contains the endpoint ID linked to the initial subflow. This variable was only set when treating the first PM_ESTABLISHED event, after ADD_ADDR_RECEIVED. That's why in fill_local_addresses_vec(), the endpoint addresses were compared with that of the initial subflow, instead of only comparing the IDs. Instead, msk->mpc_endpoint_id is now set when treating ADD_ADDR_RECEIVED as well, if needed; the IDs can then be compared. To be able to do so, the code doing that is now in a dedicated helper, and called from the functions linked to the two actions.
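For reference, the ID mapping that this helper centralizes is small enough to restate as a sketch (simplified from the patch below; the real helper takes the msk and the address entry):

	/* The initial subflow is always advertised with local ID 0, so an
	 * endpoint whose ID matches msk->mpc_endpoint_id must be reported
	 * as 0; any other endpoint keeps its own ID.
	 */
	static u8 endp_local_id(u8 mpc_endpoint_id, u8 addr_id)
	{
		return mpc_endpoint_id == addr_id ? 0 : addr_id;
	}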
While at it, mptcp_endp_get_local_id() has also been moved up, next to this new helper, because they are linked, and to be able to use it in fill_local_addresses_vec() in the next commit. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-14-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 82 +++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 117f842fe18e..55dbf89d19b8 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -268,6 +268,46 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) return NULL; } +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; +} + +/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */ +static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + struct pm_nl_pernet *pernet; + bool backup = false; + + /* do lazy endpoint usage accounting for the MPC subflows */ + if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) || + !msk->first) + return; + + subflow = mptcp_subflow_ctx(msk->first); + pernet = pm_nl_get_pernet_from_msk(msk); + + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + } + rcu_read_unlock(); + + /* Send MP_PRIO */ + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); +} + static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); @@ -278,28 +318,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) bool signal_and_subflow = false; struct mptcp_pm_local local; - /* do lazy endpoint usage accounting for the MPC subflows */ - if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - bool backup = false; - - mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); - rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr); - if (entry) { - __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); - msk->mpc_endpoint_id = entry->addr.id; - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - } - rcu_read_unlock(); - - if (backup) - mptcp_pm_send_ack(msk, subflow, true, backup); - - msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); - } + mptcp_mpc_endpoint_setup(msk); pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, endp_subflow_max, @@ -396,12 +415,9 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - 
rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->endp_list, list) { bool is_id0; @@ -417,8 +433,7 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, local->flags = entry->flags; local->ifindex = entry->ifindex; - is_id0 = mptcp_addresses_equal(&local->addr, &mpc_addr, - local->addr.port); + is_id0 = local->addr.id == msk->mpc_endpoint_id; if (c_flag_case && (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) { @@ -452,12 +467,9 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - while (msk->pm.local_addr_used < endp_subflow_max) { local = &locals[i]; @@ -469,8 +481,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) continue; - if (mptcp_addresses_equal(&local->addr, &mpc_addr, - local->addr.port)) + if (local->addr.id == msk->mpc_endpoint_id) continue; msk->pm.local_addr_used++; @@ -548,6 +559,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_addr_send_ack(msk); + mptcp_mpc_endpoint_setup(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; @@ -935,12 +947,6 @@ out_free: return ret; } -static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; -} - static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) -- cgit v1.2.3 From 539f6b9de39ec5d827b16f6f5c8f3cfd58669e93 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:50 +0200 Subject: mptcp: pm: in-kernel: add laminar endpoints Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag is not used), the in-kernel PM will create new subflows using the local address the routing configuration will pick. It would be easier to pick local addresses from a selected list of endpoints, and use it only once, than relying on routing rules. Use case: both the client (C) and the server (S) have two addresses (a and b). The client establishes the connection between C(a) and S(a). Once established, the server announces its additional address S(b). Once received, the client connects to it using its second address C(b). Compared to a situation without the 'laminar' endpoint for C(b), the client didn't use this address C(b) to establish a subflow to the server's primary address S(a). So at the end, we have: C S C(a) --- S(a) C(b) --- S(b) In case of a 3rd address on each side (C(c) and S(c)), upon the reception of an ADD_ADDR with S(c), the client should not pick C(b) because it has already been used. C(c) should then be used. Note that this situation is currently possible if C doesn't add any endpoint, but configure the routing in order to pick C(b) for the route to S(b), and pick C(c) for the route to S(c). That doesn't sound very practical because it means knowing in advance the IP addresses that will be used and announced by the server. 'laminar', like the idea of laminar flows: the different subflows don't mix with each other on an endpoint, unlike the "turbulent" way traffic is mixed by 'fullmesh'. In the code, the new endpoint type is added. 
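As a rough sketch of the selection rule described above (condensed from fill_local_laminar_endp() in the patch below; locking, address-family matching and PM accounting are omitted, and the struct is illustrative only):

	struct endp {
		u8 id;
		u32 flags;	/* MPTCP_PM_ADDR_FLAG_* */
	};

	/* Pick the first laminar endpoint whose local ID is not already in
	 * use by a live subflow, so each endpoint is used at most once.
	 */
	static int pick_laminar(const struct endp *endps, int n,
				const unsigned long *used_ids)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (!(endps[i].flags & MPTCP_PM_ADDR_FLAG_LAMINAR))
				continue;
			if (test_bit(endps[i].id, used_ids))
				continue;
			return i;
		}
		return -1;	/* no unused laminar endpoint left */
	}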
Similar to the other subflow types, an MPTCP_INFO counter is added. While at it, holes are now commented in struct mptcp_info, to remember next time that these holes can no longer be used. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-15-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 6 +++- net/mptcp/pm_kernel.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/sockopt.c | 2 ++ 4 files changed, 90 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5ec996977b3f..87cfab874e24 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -39,6 +39,7 @@ #define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) #define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) #define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) +#define MPTCP_PM_ADDR_FLAG_LAMINAR _BITUL(5) struct mptcp_info { __u8 mptcpi_subflows; @@ -51,6 +52,7 @@ struct mptcp_info { #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max + /* 16-bit hole that can no longer be filled */ __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; @@ -60,13 +62,15 @@ struct mptcp_info { __u8 mptcpi_local_addr_max; #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; + /* 8-bit hole that can no longer be filled */ __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; __u64 mptcpi_bytes_sent; __u64 mptcpi_bytes_received; __u64 mptcpi_bytes_acked; __u8 mptcpi_subflows_total; - __u8 reserved[3]; + __u8 mptcpi_endp_laminar_max; + __u8 reserved[2]; __u32 mptcpi_last_data_sent; __u32 mptcpi_last_data_recv; __u32 mptcpi_last_ack_recv; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 55dbf89d19b8..e0f44dc232aa 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,6 +21,7 @@ struct pm_nl_pernet { u8 endpoints; u8 endp_signal_max; u8 endp_subflow_max; + u8 endp_laminar_max; u8 limit_add_addr_accepted; u8 limit_extra_subflows; u8 next_id; @@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_laminar_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); + u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -458,6 +467,66 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, return i; } +static unsigned int +fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_pm_local *local; + int found = 0; + + /* Forbid creation of new subflows matching existing ones, possibly + * already created by 'subflow' endpoints + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk =
mptcp_subflow_tcp_sock(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | + TCPF_CLOSE)) + continue; + + __set_bit(subflow_get_local_id(subflow), unavail_id); + } + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr), + unavail_id)) + continue; + + local = &locals[0]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (local->addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + } + + msk->pm.extra_subflows++; + found = 1; + break; + } + rcu_read_unlock(); + + return found; +} + static unsigned int fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, @@ -532,6 +601,10 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (i) return i; + /* If there is at least one MPTCP endpoint with a laminar flag */ + if (mptcp_pm_get_endp_laminar_max(msk)) + return fill_local_laminar_endp(msk, remote, locals); + /* Special case: peer sets the C flag, accept one ADD_ADDR if default * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints */ @@ -707,6 +780,10 @@ find_next: addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); + } pernet->endpoints++; if (!entry->addr.port) @@ -1100,6 +1177,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); + } pernet->endpoints--; list_del_rcu(&entry->list); @@ -1182,6 +1263,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->endp_subflow_max, 0); + WRITE_ONCE(pernet->endp_laminar_max, 0); pernet->endpoints = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0cd3333cafaf..371084a3fc22 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,6 +1182,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 92a2a2742627..a28a48385885 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -980,6 +980,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + info->mptcpi_endp_laminar_max = + mptcp_pm_get_endp_laminar_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From d79c7d01f1c8bcf9a48337c8960d618fbe31fc0c Mon Sep 17 00:00:00 
2001 From: Luiz Augusto von Dentz Date: Fri, 27 Jun 2025 11:18:29 -0400 Subject: Bluetooth: ISO: Don't initiate CIS connections if there are no buffers If the controller has no buffers left, return -ENOBUFS to indicate that iso_cnt might be out of sync. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 5ce823ca3aaf..ac6e83313b9b 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -458,6 +458,13 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } + /* Check if there are available buffers for output/TX. */ + if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) && + (hdev->iso_pkts && !hdev->iso_cnt)) { + err = -ENOBUFS; + goto unlock; + } + /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, -- cgit v1.2.3 From 69a86cc17811c411fe336eb484a23bc0b425a814 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 27 Jun 2025 11:18:30 -0400 Subject: Bluetooth: HCI: Fix using LE/ACL buffers for ISO packets ISO packets shall not use the LE/ACL buffer pool; that feature seems to be exclusive to LE-ACL only. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 7 +++---- net/bluetooth/hci_core.c | 6 ++---- net/bluetooth/hci_event.c | 16 +++------------- 3 files changed, 8 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e524bb59bff2..091cff2155e6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -924,10 +924,9 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t case CIS_LINK: case BIS_LINK: case PA_LINK: - if (hdev->iso_mtu) - /* Dedicated ISO Buffer exists */ - break; - fallthrough; + if (!hdev->iso_mtu) + return ERR_PTR(-ECONNREFUSED); + break; case LE_LINK: if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 55e0722fd066..e2bffad9816f 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3399,8 +3399,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case CIS_LINK: case BIS_LINK: case PA_LINK: - cnt = hdev->iso_mtu ? hdev->iso_cnt : - hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; + cnt = hdev->iso_cnt; break; default: cnt = 0; @@ -3759,8 +3758,7 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) if (!hci_conn_num(hdev, type)) return; - cnt = hdev->iso_pkts ? &hdev->iso_cnt : - hdev->le_pkts ?
&hdev->le_cnt : &hdev->acl_cnt; + cnt = &hdev->iso_cnt; while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fe49e8a7969f..d790b0d4eb9a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4461,19 +4461,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, case CIS_LINK: case BIS_LINK: case PA_LINK: - if (hdev->iso_pkts) { - hdev->iso_cnt += count; - if (hdev->iso_cnt > hdev->iso_pkts) - hdev->iso_cnt = hdev->iso_pkts; - } else if (hdev->le_pkts) { - hdev->le_cnt += count; - if (hdev->le_cnt > hdev->le_pkts) - hdev->le_cnt = hdev->le_pkts; - } else { - hdev->acl_cnt += count; - if (hdev->acl_cnt > hdev->acl_pkts) - hdev->acl_cnt = hdev->acl_pkts; - } + hdev->iso_cnt += count; + if (hdev->iso_cnt > hdev->iso_pkts) + hdev->iso_cnt = hdev->iso_pkts; break; default: -- cgit v1.2.3 From 339a87883a14d6a818ca436fed41aa5d10e0f4bd Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Aug 2025 15:21:19 -0400 Subject: Bluetooth: ISO: Use sk_sndtimeo as conn_timeout This aligns the usage of socket sk_sndtimeo as conn_timeout when initiating a connection and then uses it when scheduling the resulting HCI command, similar to what has been done in bf98feea5b65 ("Bluetooth: hci_conn: Always use sk_timeo as conn_timeout"). Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 10 ++++++---- net/bluetooth/hci_conn.c | 20 ++++++++++++-------- net/bluetooth/iso.c | 16 ++++++++++------ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6560b32f3125..a068beae9318 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1587,16 +1587,18 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base); + __u8 base_len, __u8 *base, u16 timeout); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 data_len, __u8 *data); + __u8 data_len, __u8 *data, u16 timeout); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 091cff2155e6..111f0e37b672 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1539,7 +1539,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; @@ -1581,6 +1581,7 @@ static struct hci_conn *hci_add_bis(struct
hci_dev *hdev, bdaddr_t *dst, conn->state = BT_CONNECT; conn->sid = sid; + conn->conn_timeout = timeout; hci_conn_hold(conn); return conn; @@ -1921,7 +1922,8 @@ done: } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *cis; @@ -1936,6 +1938,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; + cis->conn_timeout = timeout; } if (cis->state == BT_CONNECTED) @@ -2175,7 +2178,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; struct hci_conn *parent; @@ -2196,7 +2199,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout); if (IS_ERR(conn)) return conn; @@ -2249,13 +2252,13 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base); + conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout); if (IS_ERR(conn)) return conn; @@ -2298,7 +2301,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *le; struct hci_conn *cis; @@ -2322,7 +2326,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? 
le->le_rx_phy : hdev->le_rx_def_phys); - cis = hci_bind_cis(hdev, dst, dst_type, qos); + cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ac6e83313b9b..5c68c0ea7d97 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -91,8 +91,8 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ -#define ISO_CONN_TIMEOUT (HZ * 40) -#define ISO_DISCONN_TIMEOUT (HZ * 2) +#define ISO_CONN_TIMEOUT secs_to_jiffies(20) +#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2) static void iso_conn_free(struct kref *ref) { @@ -367,7 +367,8 @@ static int iso_connect_bis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -376,7 +377,8 @@ static int iso_connect_bis(struct sock *sk) hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, - iso_pi(sk)->base_len, iso_pi(sk)->base); + iso_pi(sk)->base_len, iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -469,7 +471,8 @@ static int iso_connect_cis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -477,7 +480,8 @@ static int iso_connect_cis(struct sock *sk) } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; -- cgit v1.2.3 From c9beb36c14660713b948e289b1e352cc3d386d44 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Aug 2025 15:57:39 -0400 Subject: Bluetooth: hci_core: Detect if an ISO link has stalled This attempts to detect if an ISO link has been waiting for an ISO buffer for longer than the maximum allowed transport latency and, if so, proceeds to use hci_link_tx_to, which prints an error and disconnects.
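The check added to __check_timeout() boils down to the following shape (a condensed restatement of the hunk below, not additional code):

	/* ISO: the link is considered stuck when no buffer credit came back
	 * and nothing was transmitted for longer than the maximum transport
	 * latency, HCI_ISO_TX_TIMEOUT = usecs_to_jiffies(0x7fffff), i.e.
	 * 8.388607 seconds.
	 */
	if (!cnt && time_after(jiffies, hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT))
		hci_link_tx_to(hdev, type);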
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index df1847b74e55..9ecc70baaca9 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -488,6 +488,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ +#define HCI_ISO_TX_TIMEOUT usecs_to_jiffies(0x7fffff) /* 8388607 usecs */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a068beae9318..2924c2bf2a98 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -487,6 +487,7 @@ struct hci_dev { unsigned long acl_last_tx; unsigned long le_last_tx; + unsigned long iso_last_tx; __u8 le_tx_def_phys; __u8 le_rx_def_phys; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e2bffad9816f..4cf4bb1187dc 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3585,24 +3585,37 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { - unsigned long last_tx; + unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { + case ACL_LINK: + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; + break; case LE_LINK: - last_tx = hdev->le_last_tx; + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; - default: - last_tx = hdev->acl_last_tx; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + /* tx timeout must be longer than the maximum transport latency + * (8.388607 seconds) + */ + timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; + default: + return; } - /* tx timeout must be longer than maximum link supervision timeout - * (40.9 seconds) - */ - if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) + if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } @@ -3759,10 +3772,15 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) return; cnt = &hdev->iso_cnt; + + __check_timeout(hdev, *cnt, type); + while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); + hci_send_conn_frame(hdev, conn, skb); + hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) -- cgit v1.2.3 From 79e562a52adea4afa0601a15964498fae66c823c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 20 Aug 2025 08:50:12 -0400 Subject: Bluetooth: MGMT: Fix not exposing debug UUID on MGMT_OP_READ_EXP_FEATURES_INFO The debug UUID was only getting set if MGMT_OP_READ_EXP_FEATURES_INFO was not called with a specific index, which breaks the likes of bluetoothd since it only invokes MGMT_OP_READ_EXP_FEATURES_INFO when an adapter is plugged, so instead of depending on hdev not being set, just enable the UUID on any index like it was done with iso_sock_uuid.
Fixes: e625e50ceee1 ("Bluetooth: Introduce debug feature when dynamic debug is disabled") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 225140fcb3d6..a3d16eece0d2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4542,13 +4542,11 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, return -ENOMEM; #ifdef CONFIG_BT_FEATURE_DEBUG - if (!hdev) { - flags = bt_dbg_get() ? BIT(0) : 0; + flags = bt_dbg_get() ? BIT(0) : 0; - memcpy(rp->features[idx].uuid, debug_uuid, 16); - rp->features[idx].flags = cpu_to_le32(flags); - idx++; - } + memcpy(rp->features[idx].uuid, debug_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; #endif if (hdev && hci_dev_le_state_simultaneous(hdev)) { -- cgit v1.2.3 From 3c34d6428740e47b29ae3afd85d6f9eb656a3ea3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 19 Aug 2025 15:31:28 -0400 Subject: Bluetooth: hci_core: Print number of packets in conn->data_q This attempts to print the number of packets pending to be transmitted in the conn->data_q. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 4cf4bb1187dc..198819577fe5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3267,6 +3267,8 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, spin_unlock_bh(&queue->lock); } + + bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue)); } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) @@ -3298,6 +3300,10 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); + + bt_dev_dbg(hdev, "hcon %p queued %d", conn, + skb_queue_len(&conn->data_q)); + queue_work(hdev->workqueue, &hdev->tx_work); } @@ -3357,6 +3363,8 @@ static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, __skb_queue_tail(queue, skb); } while (list); } + + bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue)); } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) -- cgit v1.2.3 From 48a258b198c12685747beaf6392f2b68e6c542c2 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 20 Aug 2025 12:21:09 -0400 Subject: Bluetooth: hci_core: Print information of hcon on hci_low_sent This prints the information about the hcon on hci_low_sent to confirm all connection are being processed. 
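The added prints all have the shape below (copied from the hunks that follow); skb_queue_len() only reads the queue's qlen counter, so the cost is negligible for a debug statement:

	bt_dev_dbg(hdev, "hcon %p queued %d", conn,
		   skb_queue_len(&conn->data_q));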
Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 198819577fe5..3418d7b964a1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3435,6 +3435,10 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, skb_queue_empty(&c->data_q)) continue; + bt_dev_dbg(hdev, "hcon %p state %s queued %d", c, + state_to_string(c->state), + skb_queue_len(&c->data_q)); + if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; -- cgit v1.2.3 From ecb9a843be4d6fd710d7026e359f21015a062572 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 22 Sep 2025 13:13:13 -0400 Subject: Bluetooth: SCO: Fix UAF on sco_conn_free BUG: KASAN: slab-use-after-free in sco_conn_free net/bluetooth/sco.c:87 [inline] BUG: KASAN: slab-use-after-free in kref_put include/linux/kref.h:65 [inline] BUG: KASAN: slab-use-after-free in sco_conn_put+0xdd/0x410 net/bluetooth/sco.c:107 Write of size 8 at addr ffff88811cb96b50 by task kworker/u17:4/352 CPU: 1 UID: 0 PID: 352 Comm: kworker/u17:4 Not tainted 6.17.0-rc5-g717368f83676 #4 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Workqueue: hci13 hci_cmd_sync_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x10b/0x170 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x191/0x550 mm/kasan/report.c:482 kasan_report+0xc4/0x100 mm/kasan/report.c:595 sco_conn_free net/bluetooth/sco.c:87 [inline] kref_put include/linux/kref.h:65 [inline] sco_conn_put+0xdd/0x410 net/bluetooth/sco.c:107 sco_connect_cfm+0xb4/0xae0 net/bluetooth/sco.c:1441 hci_connect_cfm include/net/bluetooth/hci_core.h:2082 [inline] hci_conn_failed+0x20a/0x2e0 net/bluetooth/hci_conn.c:1313 hci_conn_unlink+0x55f/0x810 net/bluetooth/hci_conn.c:1121 hci_conn_del+0xb6/0x1110 net/bluetooth/hci_conn.c:1147 hci_abort_conn_sync+0x8c5/0xbb0 net/bluetooth/hci_sync.c:5689 hci_cmd_sync_work+0x281/0x380 net/bluetooth/hci_sync.c:332 process_one_work kernel/workqueue.c:3236 [inline] process_scheduled_works+0x77e/0x1040 kernel/workqueue.c:3319 worker_thread+0xbee/0x1200 kernel/workqueue.c:3400 kthread+0x3c7/0x870 kernel/kthread.c:463 ret_from_fork+0x13a/0x1e0 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Allocated by task 31370: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x70 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:388 [inline] __kasan_kmalloc+0x82/0x90 mm/kasan/common.c:405 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4382 [inline] __kmalloc_noprof+0x22f/0x390 mm/slub.c:4394 kmalloc_noprof include/linux/slab.h:909 [inline] sk_prot_alloc+0xae/0x220 net/core/sock.c:2239 sk_alloc+0x34/0x5a0 net/core/sock.c:2295 bt_sock_alloc+0x3c/0x330 net/bluetooth/af_bluetooth.c:151 sco_sock_alloc net/bluetooth/sco.c:562 [inline] sco_sock_create+0xc0/0x350 net/bluetooth/sco.c:593 bt_sock_create+0x161/0x3b0 net/bluetooth/af_bluetooth.c:135 __sock_create+0x3ad/0x780 net/socket.c:1589 sock_create net/socket.c:1647 [inline] __sys_socket_create net/socket.c:1684 [inline] __sys_socket+0xd5/0x330 net/socket.c:1731 __do_sys_socket net/socket.c:1745 [inline] __se_sys_socket net/socket.c:1743 [inline] __x64_sys_socket+0x7a/0x90 net/socket.c:1743 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xc7/0x240 
arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 31374: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x70 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:243 [inline] __kasan_slab_free+0x3d/0x50 mm/kasan/common.c:275 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2428 [inline] slab_free mm/slub.c:4701 [inline] kfree+0x199/0x3b0 mm/slub.c:4900 sk_prot_free net/core/sock.c:2278 [inline] __sk_destruct+0x4aa/0x630 net/core/sock.c:2373 sco_sock_release+0x2ad/0x300 net/bluetooth/sco.c:1333 __sock_release net/socket.c:649 [inline] sock_close+0xb8/0x230 net/socket.c:1439 __fput+0x3d1/0x9e0 fs/file_table.c:468 task_work_run+0x206/0x2a0 kernel/task_work.c:227 get_signal+0x1201/0x1410 kernel/signal.c:2807 arch_do_signal_or_restart+0x34/0x740 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop+0x68/0xc0 kernel/entry/common.c:40 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x1dd/0x240 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-by: cen zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d382d980fd9a..ab0cf442d57b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -498,6 +498,13 @@ static void sco_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); + /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + if (sco_pi(sk)->conn) { + sco_conn_lock(sco_pi(sk)->conn); + sco_pi(sk)->conn->sk = NULL; + sco_conn_unlock(sco_pi(sk)->conn); + } + /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); -- cgit v1.2.3 From 9950f095d6c875dbe0c9ebfcf972ec88fdf26fc8 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 22 Sep 2025 16:27:51 -0400 Subject: Bluetooth: ISO: Fix possible UAF on iso_conn_free This attempt to fix similar issue to sco_conn_free where if the conn->sk is not set to NULL may lead to UAF on iso_conn_free. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 5c68c0ea7d97..d24c7a1ace92 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -761,6 +761,13 @@ static void iso_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); + /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + if (iso_pi(sk)->conn) { + iso_conn_lock(iso_pi(sk)->conn); + iso_pi(sk)->conn->sk = NULL; + iso_conn_unlock(iso_pi(sk)->conn); + } + /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); -- cgit v1.2.3 From 6ba85da5804efffe15c89b03742ea868f20b4172 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 22 Sep 2025 21:11:21 +0300 Subject: Bluetooth: ISO: free rx_skb if not consumed If iso_conn is freed when RX is incomplete, free any leftover skb piece. 
Fixes: dc26097bdb86 ("Bluetooth: ISO: Use kref to track lifetime of iso_conn") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index d24c7a1ace92..ad5c8118a6e3 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -111,6 +111,8 @@ static void iso_conn_free(struct kref *ref) /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); + kfree_skb(conn->rx_skb); + kfree(conn); } -- cgit v1.2.3 From 5bf863f4c5da055c1eb08887ae4f26d99dbc4aac Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 22 Sep 2025 21:11:22 +0300 Subject: Bluetooth: ISO: don't leak skb in ISO_CONT RX For ISO_CONT RX, the data from skb is copied to conn->rx_skb, but the skb is leaked. Free skb after copying its data. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ad5c8118a6e3..9b263d061e05 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2427,7 +2427,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; - return; + break; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), -- cgit v1.2.3 From 03ddb4ac251463ec5b7b069395d9ab89163dd56c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 19 Sep 2025 12:30:05 -0400 Subject: Bluetooth: hci_sync: Fix using random address for BIG/PA advertisements When creating an advertisement for BIG the address shall not be non-resolvable since in case of acting as BASS/Broadcast Assistant the address must be the same as the connection in order to use the PAST method and even when PAST/BASS are not in the picture a Periodic Advertisement can still be synchronized thus the same argument as to connectable advertisements still stand. Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Paul Menzel --- net/bluetooth/hci_sync.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7a7d49890858..eefdb6134ca5 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1325,7 +1325,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; struct hci_rp_le_set_ext_adv_params rp; - bool connectable; + bool connectable, require_privacy; u32 flags; bdaddr_t random_addr; u8 own_addr_type; @@ -1363,10 +1363,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) return -EPERM; /* Set require_privacy to true only when non-connectable - * advertising is used. In that case it is fine to use a - * non-resolvable private address. + * advertising is used and it is not periodic. + * In that case it is fine to use a non-resolvable private address. 
*/ - err = hci_get_random_address(hdev, !connectable, + require_privacy = !connectable && !(adv && adv->periodic); + + err = hci_get_random_address(hdev, require_privacy, adv_use_rpa(hdev, flags), adv, &own_addr_type, &random_addr); if (err < 0) -- cgit v1.2.3 From be812ace0378a9db86344ad637c5ed2a5d11f216 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 9 Sep 2025 14:13:35 +0200 Subject: Bluetooth: Avoid a couple dozen -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the __struct_group() helper to fix 31 instances of the following type of warnings: 30 net/bluetooth/mgmt_config.c:16:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] 1 net/bluetooth/mgmt_config.c:22:33: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 9 +++++++-- net/bluetooth/mgmt_config.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 3575cd16049a..74edea06985b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -53,10 +53,15 @@ struct mgmt_hdr { } __packed; struct mgmt_tlv { - __le16 type; - __u8 length; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(mgmt_tlv_hdr, __hdr, __packed, + __le16 type; + __u8 length; + ); __u8 value[]; } __packed; +static_assert(offsetof(struct mgmt_tlv, value) == sizeof(struct mgmt_tlv_hdr), + "struct member likely outside of __struct_group()"); struct mgmt_addr_info { bdaddr_t bdaddr; diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 6ef701c27da4..c4063d200c0a 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -13,13 +13,13 @@ #define HDEV_PARAM_U16(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __le16 value; \ } __packed _param_name_ #define HDEV_PARAM_U8(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __u8 value; \ } __packed _param_name_ -- cgit v1.2.3 From f12b69d8f22824a07f17c1399c99757072de73e0 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 27 Sep 2025 19:39:08 +0200 Subject: batman-adv: Release references to inactive interfaces Trying to dump the originators or the neighbors via netlink for a meshif with an inactive primary interface is not allowed. The dump functions were checking this correctly but they didn't handle non-existing primary interfaces and existing _inactive_ interfaces differently. (Primary) batadv_hard_ifaces hold a references to a net_device. And accessing them is only allowed when either being in a RCU/spinlock protected section or when holding a valid reference to them. The netlink dump functions use the latter. But because the missing specific error handling for inactive primary interfaces, the reference was never dropped. This reference counting error was only detected when the interface should have been removed from the system: unregister_netdevice: waiting for batadv_slave_0 to become free. 
Usage count = 2 Cc: stable@vger.kernel.org Fixes: 6ecc4fd6c2f4 ("batman-adv: netlink: reduce duplicate code by returning interfaces") Reported-by: syzbot+881d65229ca4f9ae8c84@syzkaller.appspotmail.com Reported-by: Tetsuo Handa Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/originator.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index a464ff96b929..ed89d7fd1e7f 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -764,11 +764,16 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + if (!primary_if) { ret = -ENOENT; goto out_put_mesh_iface; } + if (primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out_put_primary_if; + } + hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); @@ -1333,11 +1338,16 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + if (!primary_if) { ret = -ENOENT; goto out_put_mesh_iface; } + if (primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out_put_primary_if; + } + hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); -- cgit v1.2.3 From 2804359536275d8d5f92eb1949102eca4153ea1e Mon Sep 17 00:00:00 2001 From: Markus Heidelberg Date: Fri, 26 Sep 2025 15:13:23 +0200 Subject: net: ethtool: remove duplicated mm.o from Makefile Fixes: 2b30f8291a30 ("net: ethtool: add support for MAC Merge layer") Signed-off-by: Markus Heidelberg Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250926131323.222192-1-m.heidelberg@cab.de Signed-off-by: Jakub Kicinski --- net/ethtool/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index a1490c4afe6b..1e493553b977 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -8,5 +8,5 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ - module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ + module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \ phy.o tsconfig.o -- cgit v1.2.3 From 2b235765e9d4426cf56d7fd1a331f81a4dbbd85a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Sep 2025 22:49:14 +0000 Subject: scm: use masked_user_access_begin() in put_cmsg() Use the greatest and latest uaccess construct to get an optimal code. 
Before : lea (%r9,%rcx,1),%r10 movabs $,%r11 mov $0xfffffff2,%eax cmp %rcx,%r10 jb ffffffff81cdc312 cmp %r11,%r10 ja ffffffff81cdc312 stac lfence mov %r9,(%rcx) After: movabs $,%r9 cmp %r9,%rax cmova %r9,%rax stac mov %rcx,(%rax) Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250925224914.3590290-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/scm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 072d5742440a..66eaee783e8b 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -273,7 +273,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) check_object_size(data, cmlen - sizeof(*cm), true); - if (!user_write_access_begin(cm, cmlen)) + if (can_do_masked_user_access()) + cm = masked_user_access_begin(cm); + else if (!user_write_access_begin(cm, cmlen)) goto efault; unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); -- cgit v1.2.3 From 1fb0e471611dc6a79dee609a7e0037eb1d124400 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Sep 2025 23:09:29 +0000 Subject: net: remove one stac/clac pair from move_addr_to_user() Convert the get_user() and __put_user() code to the fast masked_user_access_begin()/unsafe_{get|put}_user() variant. This patch increases the performance of an UDP recvfrom() receiver (netserver) on 120 bytes messages by 7 % on an AMD EPYC 7B12 64-Core Processor platform. Presence of audit_sockaddr() makes difficult to avoid the stac/clac pair in the copy_to_user() call, this is left for a future patch. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250925230929.3727873-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/socket.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 682969deaed3..5bc4ee0bb75d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -276,28 +276,41 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { - int err; int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); - err = get_user(len, ulen); - if (err) - return err; + + if (can_do_masked_user_access()) + ulen = masked_user_access_begin(ulen); + else if (!user_access_begin(ulen, 4)) + return -EFAULT; + + unsafe_get_user(len, ulen, efault_end); + if (len > klen) len = klen; - if (len < 0) - return -EINVAL; + /* + * "fromlen shall refer to the value before truncation.." + * 1003.1g + */ + if (len >= 0) + unsafe_put_user(klen, ulen, efault_end); + + user_access_end(); + if (len) { + if (len < 0) + return -EINVAL; if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } - /* - * "fromlen shall refer to the value before truncation.." - * 1003.1g - */ - return __put_user(klen, ulen); + return 0; + +efault_end: + user_access_end(); + return -EFAULT; } static struct kmem_cache *sock_inode_cachep __ro_after_init; -- cgit v1.2.3 From f017c1f768b670bced4464476655b27dfb937e67 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 27 Sep 2025 09:28:27 +0000 Subject: tcp: use skb->len instead of skb->truesize in tcp_can_ingest() Some applications are stuck to the 20th century and still use small SO_RCVBUF values. After the blamed commit, we can drop packets especially when using LRO/hw-gro enabled NIC and small MSS (1500) values. 
LRO/hw-gro NIC pack multiple segments into pages, allowing tp->scaling_ratio to be set to a high value. Whenever the receive queue gets full, we can receive a small packet filling RWIN, but with a high skb->truesize, because most NIC use 4K page plus sk_buff metadata even when receiving less than 1500 bytes of payload. Even if we refine how tp->scaling_ratio is estimated, we could have an issue at the start of the flow, because the first round of packets (IW10) will be sent based on the initial tp->scaling_ratio (1/2) Relax tcp_can_ingest() to use skb->len instead of skb->truesize, allowing the peer to use final RWIN, assuming a 'perfect' scaling_ratio of 1. Fixes: 1d2fbaad7cd8 ("tcp: stronger sk_rcvbuf checks") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250927092827.2707901-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 79d5252ed6cc..0a2511ce34db 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5086,12 +5086,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); /* Check if this incoming skb can be added to socket receive queues * while satisfying sk->sk_rcvbuf limit. + * + * In theory we should use skb->truesize, but this can cause problems + * when applications use too small SO_RCVBUF values. + * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, + * allowing RWIN to be close to available space. + * Whenever the receive queue gets full, we can receive a small packet + * filling RWIN, but with a high skb->truesize, because most NIC use 4K page + * plus sk_buff metadata even when receiving less than 1500 bytes of payload. + * + * Note that we use skb->len to decide to accept or drop this packet, + * but sk->sk_rmem_alloc is the sum of all skb->truesize. */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { - unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize; + unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return new_mem <= sk->sk_rcvbuf; + return rmem + skb->len <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, -- cgit v1.2.3 From 9aa59323f2709370cb4f01acbba599a9167f317b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:37 +0200 Subject: mptcp: leverage skb deferral free Usage of the skb deferral API is straight-forward; with multiple subflows actives this allow moving part of the received application load into multiple CPUs. Also fix a typo in the related comment. 
Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-1-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 735a209d4072..62cdd2bcff9d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1943,12 +1943,13 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, } if (!(flags & MSG_PEEK)) { - /* avoid the indirect call, we know the destructor is sock_wfree */ + /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; + skb->sk = NULL; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); - __kfree_skb(skb); + skb_attempt_defer_free(skb); msk->bytes_consumed += count; } -- cgit v1.2.3 From a7556779745c047efb7b0ce8732889b0cdc80936 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:38 +0200 Subject: tcp: make tcp_rcvbuf_grow() accessible to mptcp code To leverage the auto-tuning improvements brought by commit 2da35e4b4df9 ("Merge branch 'tcp-receive-side-improvements'"), the MPTCP stack needs to access the mentioned helper. Acked-by: Geliang Tang Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-2-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7c51a0a5ace8..5ca230ed526a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,6 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); +void tcp_rcvbuf_grow(struct sock *sk); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0a2511ce34db..b44fdc309633 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -891,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -static void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.2.3 From e118cdc34dd109562b64f6a397f68cd33b041d5b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:39 +0200 Subject: mptcp: rcvbuf auto-tuning improvement Apply to the MPTCP auto-tuning the same improvements introduced for the TCP protocol by the merge commit 2da35e4b4df9 ("Merge branch 'tcp-receive-side-improvements'"). The main difference is that the TCP subflows and the main MPTCP socket need to account for OoO data separately: MPTCP does not care about TCP-level OoO and vice versa; as a consequence, the MPTCP-level rcvbuf increase due to OoO packets is not reflected at the subflow level.
This refactoring additionally allows dropping the msk receive buffer update at receive time, as the latter was only intended to cope with subflow receive buffer increases due to OoO packets. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/487 Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/559 Reviewed-by: Geliang Tang Tested-by: Geliang Tang Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-3-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 97 +++++++++++++++++++++++++--------------------------- net/mptcp/protocol.h | 4 +-- 2 files changed, 49 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 62cdd2bcff9d..f994e7f45f7b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -179,6 +179,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } +/* "inspired" by tcp_rcvbuf_grow(), main difference: + * - mptcp does not maintain a msk-level window clamp + * - returns true when the receive buffer is actually updated + */ +static bool mptcp_rcvbuf_grow(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + const struct net *net = sock_net(sk); + int rcvwin, rcvbuf, cap; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + return false; + + rcvwin = msk->rcvq_space.space << 1; + + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; + + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + + rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + return true; + } + return false; +} + /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -292,6 +321,9 @@ merge_right: end: skb_condense(skb); skb_set_owner_r(skb, sk); + /* do not grow rcvbuf for not-yet-accepted or orphaned sockets.
*/ + if (sk->sk_socket) + mptcp_rcvbuf_grow(sk); } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -784,18 +816,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return moved; } -static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) -{ - if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) - WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); -} - static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); - __mptcp_rcvbuf_update(sk, ssk); - /* Wake-up the reader only for in-sequence data */ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); @@ -2014,48 +2038,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - u64 rcvwin, grow; - int rcvbuf; - - rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; - - grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); - - do_div(grow, msk->rcvq_space.space); - rcvwin += (grow << 1); - - rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - - if (rcvbuf > sk->sk_rcvbuf) { - u32 window_clamp; - - window_clamp = mptcp_win_from_space(sk, rcvbuf); - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + msk->rcvq_space.space = msk->rcvq_space.copied; + if (mptcp_rcvbuf_grow(sk)) { - /* Make subflows follow along. If we do not do this, we - * get drops at subflow level if skbs can't be moved to - * the mptcp rx queue fast enough (announced rcv_win can - * exceed ssk->sk_rcvbuf). - */ - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk; - bool slow; + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). 
+ */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + bool slow; - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); - WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); - if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); - unlock_sock_fast(ssk, slow); - } + ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); + tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied; + tcp_rcvbuf_grow(ssk); + unlock_sock_fast(ssk, slow); } } - msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; @@ -2084,11 +2086,6 @@ static bool __mptcp_move_skbs(struct sock *sk) if (list_empty(&msk->conn_list)) return false; - /* verify we can move any data from the subflow, eventually updating */ - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) - mptcp_for_each_subflow(msk, subflow) - __mptcp_rcvbuf_update(sk, subflow->tcp_sock); - subflow = list_first_entry(&msk->conn_list, struct mptcp_subflow_context, node); for (;;) { diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 371084a3fc22..52f9cfa4ce95 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -341,8 +341,8 @@ struct mptcp_sock { struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { - u32 space; /* bytes copied in last measurement window */ - u32 copied; /* bytes copied in this measurement window */ + int space; /* bytes copied in last measurement window */ + int copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; -- cgit v1.2.3 From 9a0afe0db46720ce1a009c7dac168aa0584bd732 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:40 +0200 Subject: mptcp: introduce the mptcp_init_skb helper Factor out all the skb initialization steps into a new helper and use it. Note that this change moves the MPTCP CB initialization earlier: we can do this step as soon as the skb leaves the subflow socket receive queues.
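Condensed from the diff below, the subflow-to-msk move now splits into an init step and a move step (a sketch, without the mapping checks around it):

	size_t len = skb->len - offset;

	/* set up the MPTCP CB, unlink from the subflow receive queue and
	 * drop the dst while the subflow socket is still known
	 */
	mptcp_init_skb(ssk, skb, offset, len);
	skb_orphan(skb);	/* detach the skb from the subflow socket */
	ret = __mptcp_move_skb(sk, skb) || ret;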
Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Tested-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-4-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f994e7f45f7b..832782e23740 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -326,27 +326,11 @@ end: mptcp_rcvbuf_grow(sk); } -static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, - struct sk_buff *skb, unsigned int offset, - size_t copy_len) +static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, + int copy_len) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct sock *sk = (struct sock *)msk; - struct sk_buff *tail; - bool has_rxtstamp; - - __skb_unlink(skb, &ssk->sk_receive_queue); - - skb_ext_reset(skb); - skb_orphan(skb); - - /* try to fetch required memory from subflow */ - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); - goto drop; - } - - has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq @@ -358,6 +342,24 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 0; + __skb_unlink(skb, &ssk->sk_receive_queue); + + skb_ext_reset(skb); + skb_dst_drop(skb); +} + +static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) +{ + u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq; + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tail; + + /* try to fetch required memory from subflow */ + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); + goto drop; + } + if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; @@ -678,7 +680,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (offset < skb->len) { size_t len = skb->len - offset; - ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; + mptcp_init_skb(ssk, skb, offset, len); + skb_orphan(skb); + ret = __mptcp_move_skb(sk, skb) || ret; seq += len; if (unlikely(map_remaining < len)) { -- cgit v1.2.3 From c4ebc4ee4e751c6430604c52344d932bf1fde379 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:41 +0200 Subject: mptcp: remove unneeded mptcp_move_skb() Since commit b7535cfed223 ("mptcp: drop legacy code around RX EOF"), sk_shutdown can't change during the main recvmsg loop, we can drop the related race breaker. 
Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-5-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 832782e23740..26fbd9f6a3f7 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2207,14 +2207,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (sk->sk_shutdown & RCV_SHUTDOWN) { - /* race breaker: the shutdown could be after the - * previous receive queue check - */ - if (__mptcp_move_skbs(sk)) - continue; + if (sk->sk_shutdown & RCV_SHUTDOWN) break; - } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; -- cgit v1.2.3 From 68c7af988bd137479101e2b40ab5fdd0e0365364 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:42 +0200 Subject: mptcp: factor out a basic skb coalesce helper The upcoming patch will introduce backlog processing for the MPTCP socket, and we want to leverage coalescing in that data path. Factor out the relevant bits not touching memory accounting to deal with this use case. Co-developed-by: Geliang Tang Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-6-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 26fbd9f6a3f7..da21f1807729 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -142,22 +142,33 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, - struct sk_buff *from) +static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from, bool *fragstolen, + int *delta) { - bool fragstolen; - int delta; + int limit = READ_ONCE(sk->sk_rcvbuf); if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || MPTCP_SKB_CB(from)->offset || - ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || + ((to->len + from->len) > (limit >> 3)) || !skb_try_coalesce(to, from, fragstolen, delta)) return false; pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; + return true; +} + +static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from) +{ + bool fragstolen; + int delta; + + if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta)) + return false; /* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non -- cgit v1.2.3 From 59701b1870032c1bf32244d87476bcd4b5ecb41b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 27 Sep 2025 11:40:43 +0200 Subject: mptcp: minor move_skbs_to_msk() cleanup This function is called only by __mptcp_data_ready(), which in turn is always invoked when msk is not owned by the user: we can drop the redundant, related check.
Additionally, MPTCP needs to propagate the socket error only for the current subflow. Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-7-5da266aa9c1a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index da21f1807729..0292162a14ee 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -814,12 +814,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) moved = __mptcp_move_skbs_from_subflow(msk, ssk); __mptcp_ofo_queue(msk); - if (unlikely(ssk->sk_err)) { - if (!sock_owned_by_user(sk)) - __mptcp_error_report(sk); - else - __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); - } + if (unlikely(ssk->sk_err)) + __mptcp_subflow_error_report(sk, ssk); /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but -- cgit v1.2.3 From 4ed9db2dc5d8981ecb7042f084f5cff43ba539d6 Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Mon, 29 Sep 2025 01:54:12 -0700 Subject: net: rtnetlink: fix typo in rtnl_unregister_all() comment Corrected "rtnl_unregster()" -> "rtnl_unregister()" in the documentation comment of "rtnl_unregister_all()" Signed-off-by: Alok Tiwari Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250929085418.49200-1-alok.a.tiwari@oracle.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d9e68ca84926..8040ff7c356e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -478,7 +478,7 @@ static int rtnl_unregister(int protocol, int msgtype) * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * - * Identical to calling rtnl_unregster() for all registered message types + * Identical to calling rtnl_unregister() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) -- cgit v1.2.3 From 7bd80ed89d72285515db673803b021469ba71ee8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 24 Sep 2025 14:02:41 +0200 Subject: Documentation: net: add flow control guide and document ethtool API Introduce a new document, flow_control.rst, to provide a comprehensive guide on Ethernet Flow Control in Linux. The guide explains how flow control works, how autonegotiation resolves pause capabilities, and how to configure it using ethtool and Netlink. In parallel, document the pause and pause-stat attributes in the ethtool.yaml netlink spec. This enables the ynl tool to generate kernel-doc comments for the corresponding enums in the UAPI header, making the C interface self-documenting. Finally, replace the legacy flow control section in phy.rst with a reference to the new document and add pointers in the relevant C source files.
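As a side note for userspace authors, the pause parameters covered by the new document can also be queried from C through the legacy SIOCETHTOOL ioctl; a minimal sketch ("eth0" is a placeholder interface name, and the netlink interface remains the preferred transport):

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	int main(void)
	{
		struct ethtool_pauseparam pp = { .cmd = ETHTOOL_GPAUSEPARAM };
		struct ifreq ifr;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0)
			return 1;
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
		ifr.ifr_data = (void *)&pp;
		if (ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
			perror("ETHTOOL_GPAUSEPARAM");
			return 1;
		}
		printf("autoneg %u rx %u tx %u\n",
		       pp.autoneg, pp.rx_pause, pp.tx_pause);
		return 0;
	}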
Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20250924120241.724850-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 ++ Documentation/networking/flow_control.rst | 373 +++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 ++- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 + net/ethtool/pause.c | 4 + 8 files changed, 453 insertions(+), 15 deletions(-) create mode 100644 Documentation/networking/flow_control.rst (limited to 'net') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 6a0fb1974513..e4852505294f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,7 +864,9 @@ attribute-sets: - name: pause-stat + doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt + enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -875,13 +877,17 @@ attribute-sets: type: pad - name: tx-frames + doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames + doc: Number of PAUSE frames received. type: u64 - name: pause + doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt + enum-name: ethtool-a-pause attributes: - name: unspec @@ -893,19 +899,40 @@ attribute-sets: nested-attributes: header - name: autoneg + doc: | + Acts as a mode selector for the driver. + On GET: indicates the driver's behavior. If true, the driver will + respect the negotiated outcome; if false, the driver will use a + forced configuration. + On SET: if true, the driver configures the PHY's advertisement based + on the rx and tx attributes. If false, the driver forces the MAC + into the state defined by the rx and tx attributes. type: u8 - name: rx + doc: | + Enable receiving PAUSE frames (pausing local TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: tx + doc: | + Enable transmitting PAUSE frames (pausing peer TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: stats + doc: | + Contains the pause statistics counters. The source of these + statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src + doc: | + Selects the source of the MAC statistics, values from + enum ethtool_mac_stats_src. This allows requesting statistics + from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst new file mode 100644 index 000000000000..48646d54513f --- /dev/null +++ b/Documentation/networking/flow_control.rst @@ -0,0 +1,373 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _ethernet-flow-control: + +===================== +Ethernet Flow Control +===================== + +This document is a practical guide to Ethernet Flow Control in Linux, covering +what it is, how it works, and how to configure it. + +What is Flow Control? +===================== + +Flow control is a mechanism to prevent a fast sender from overwhelming a +slow receiver with data, which would cause buffer overruns and dropped packets. +The receiver can signal the sender to temporarily stop transmitting, giving it +time to process its backlog. 
+ +Standards references +==================== + +Ethernet flow control mechanisms are specified across consolidated IEEE base +standards; some originated as amendments: + +- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** + (half-duplex). +- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** + (originally **802.3x**). +- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** + (originally **802.1Qbb**). + +In the remainder of this document, the consolidated clause numbers are used. + +How It Works: The Mechanisms +============================ + +The method used for flow control depends on the link's duplex mode. + +.. note:: + The user-visible ``ethtool`` pause API described in this document controls + **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the + collision-based behavior that exists on half-duplex links. + +1. Half-Duplex: Collision-Based Flow Control +-------------------------------------------- +On half-duplex links, a device cannot send and receive simultaneously, so PAUSE +frames are not used. Flow control is achieved by leveraging the CSMA/CD +(Carrier Sense Multiple Access with Collision Detection) protocol itself. + +* **How it works**: To inhibit incoming data, a receiving device can force a + collision on the line. When the sending station detects this collision, it + terminates its transmission, sends a "jam" signal, and then executes the + "Collision backoff and retransmission" procedure as defined in IEEE 802.3, + Section 4.2.3.2.5. This algorithm makes the sender wait for a random + period before attempting to retransmit. By repeatedly forcing collisions, + the receiver can effectively throttle the sender's transmission rate. + +.. note:: + While this mechanism is part of the IEEE standard, there is currently no + generic kernel API to configure or control it. Drivers should not enable + this feature until a standardized interface is available. + +.. warning:: + On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a + hub rather than a switch) forcing collisions inhibits traffic **across the + entire shared segment**, not just a single point-to-point link. Enabling + such behavior is generally undesirable. + +2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) +------------------------------------------------------ +On full-duplex links, devices can send and receive at the same time. Flow +control is achieved by sending a special **PAUSE frame**, defined by IEEE +802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore +called *link-wide PAUSE*. + +* **What it is**: A standard Ethernet frame with a globally reserved + destination MAC address (``01-80-C2-00-00-01``). This address is in a range + that standard IEEE 802.1D-compliant bridges do not forward. However, some + unmanaged or misconfigured bridges have been reported to forward these + frames, which can disrupt flow control across a network. + +* **How it works**: The frame contains a MAC Control opcode for PAUSE + (``0x0001``) and a ``pause_time`` value, telling the sender how long to + wait before sending more data frames. This time is specified in units of + "pause quantum", where one quantum is the time it takes to transmit 512 bits. + For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, + and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates + that the transmitter can resume transmission, even if a previous non-zero + pause time has not yet elapsed. 
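+* **Worked example**: the wall-clock duration of a PAUSE request follows
+  directly from the quantum definition of 512 bit times (an illustrative
+  helper, not a kernel API; assumes a non-zero link speed):
+
+  .. code-block:: c
+
+     /* ns a peer is asked to pause: one quantum is 512 bit times */
+     static unsigned long long pause_ns(unsigned int pause_time,
+                                        unsigned int speed_mbps)
+     {
+             /* 512 bits at S Mbit/s take (512 * 1000 / S) ns */
+             return (unsigned long long)pause_time * 512 * 1000 / speed_mbps;
+     }
+
+  At 10 Mbit/s this yields 51.2 us per quantum and at 1 Gbit/s 512 ns,
+  matching the figures above.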
+ +* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. + +3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) +------------------------------------------------------------------------- +Priority-based Flow Control is an enhancement to the standard PAUSE mechanism +that allows flow control to be applied independently to different classes of +traffic, identified by their priority level. + +* **What it is**: PFC allows a receiver to pause traffic for one or more of the + 8 standard priority levels without stopping traffic for other priorities. + This is critical in data center environments for protocols that cannot + tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet + or RoCE). + +* **How it works**: PFC uses a specific PAUSE frame format. It shares the same + globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy + PAUSE frames but uses a unique opcode (``0x0101``). The frame payload + contains two key fields: + + - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to + one of the 8 priorities. If a bit is set to 1, it means the pause time + for that priority is active. + - **``time_vector``**: A list of eight 2-octet fields, one for each priority. + Each field specifies the ``pause_time`` for its corresponding priority, + measured in units of ``pause_quanta`` (the time to transmit 512 bits). + +.. note:: + When PFC is enabled for at least one priority on a port, the standard + **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. + The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). + +Configuring Flow Control +======================== + +Link-wide PAUSE and Priority-based Flow Control are configured with different +tools. + +Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) +------------------------------------------------------------------- +Use ``ethtool -a `` to view and ``ethtool -A `` to change +the link-wide PAUSE settings. + +.. code-block:: bash + + # View current link-wide PAUSE settings + ethtool -a eth0 + + # Enable RX and TX pause, with autonegotiation + ethtool -A eth0 autoneg on rx on tx on + +**Key Configuration Concepts**: + +* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` + controls **Pause Autoneg** (Annex 31B) only. It is independent from the + **Generic link autonegotiation** configured with ``ethtool -s``. A device can + have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. + +* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** + advertise pause in the PHY. The MAC PAUSE state is **forced** according to + ``rx``/``tx`` and does not depend on partner capabilities or resolution. + Ensure the peer is configured complementarily for PAUSE to be effective. + +* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy + is **remembered** by the kernel and applied later when Generic autoneg is + enabled again. + +* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` + capabilities. The final active state is determined by what both sides of the + link agree on. See the "PHY (Physical Layer Transceiver)" section below, + especially the *Resolution* subsection, for details of the negotiation rules. + +* **Forced Mode**: This mode is necessary when autonegotiation is not used or + not possible. 
This includes links where one or both partners have + autonegotiation disabled, or in setups without a PHY (e.g., direct + MAC-to-MAC connections). The driver bypasses PHY advertisement and + directly forces the MAC into the specified ``rx``/``tx`` state. The + configuration on both sides of the link must be complementary. For + example, if one side is set to ``tx on`` ``rx off``, the link partner must be + set to ``tx off`` ``rx on`` for flow control to function correctly. + +Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) +---------------------------------------------------- +PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the +``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this +document shows ``dcb(8)`` examples. + +**Viewing PFC Settings**: + +.. code-block:: text + + $ dcb pfc show dev eth0 + pfc-cap 8 macsec-bypass off delay 4096 + prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on + +This shows the PFC state (on/off) for each priority (0-7). + +**Changing PFC Settings**: + +.. code-block:: bash + + # Enable PFC on priorities 6 and 7, leaving others as they are + $ dcb pfc set dev eth0 prio-pfc 6:on 7:on + + # Disable PFC for all priorities except 6 and 7 + $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on + +Monitoring Flow Control +======================= + +The standard way to check if flow control is actively being used is to view the +pause-related statistics. + +**Monitoring Link-wide PAUSE**: +Use ``ethtool --include-statistics -a ``. + +.. code-block:: text + + $ ethtool --include-statistics -a eth0 + Pause parameters for eth0: + ... + Statistics: + tx_pause_frames: 0 + rx_pause_frames: 0 + +**Monitoring PFC**: +PFC statistics (sent and received frames per priority) are available +through the ``dcb`` tool. + +.. code-block:: text + + $ dcb pfc show dev eth0 requests indications + requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 + indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 + +The ``requests`` counters track transmitted PFC frames (TX), and the +``indications`` counters track received PFC frames (RX). + +Link-wide PAUSE Autonegotiation Details +======================================= + +The autonegotiation process for link-wide PAUSE is managed by the PHY and +involves advertising capabilities and resolving the outcome. + +* Terminology (link-wide PAUSE): + + - **Symmetric pause**: both directions are paused when requested (TX+RX + enabled). + - **Asymmetric pause**: only one direction is paused (e.g., RX-only or + TX-only). + + In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded + using two bits (Pause/Asym) and resolved per the standard truth tables + below. + +* **Advertisement**: The PHY advertises the MAC's flow control capabilities. + This is done using two bits in the advertisement register: "Symmetric + Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be + interpreted as a combined value, not as independent flags. The kernel + converts the user's ``rx`` and ``tx`` settings into this two-bit value as + follows: + + .. code-block:: text + + tx rx | Pause Asym + -------+------------- + 0 0 | 0 0 + 0 1 | 1 1 + 1 0 | 0 1 + 1 1 | 1 0 + +* **Resolution**: After negotiation, the PHY reports the link partner's + advertised Pause and Asym bits. The final flow control mode is determined + by the combination of the local and partner advertisements, according to + the IEEE 802.3 standard: + + .. 
code-block:: text + + Local Device | Link Partner | Result + Pause Asym | Pause Asym | + -------------------+--------------------+--------- + 0 X | 0 X | Disabled + 0 1 | 1 0 | Disabled + 0 1 | 1 1 | TX only + 1 0 | 0 X | Disabled + 1 X | 1 X | TX + RX + 1 1 | 0 1 | RX only + + It is important to note that the advertised bits reflect the *current + configuration* of the MAC, which may not represent its full hardware + capabilities. + +Kernel Policy: "Set and Trust" +============================== + +The ethtool pause API is defined as a **wish policy** for +IEEE 802.3 link-wide PAUSE only. A user request is always accepted +as the preferred configuration, but it may not be possible to apply +it in all link states. + +Key constraints: + +- Link-wide PAUSE is not valid on half-duplex links. +- Link-wide PAUSE cannot be used together with Priority-based Flow Control + (PFC, IEEE 802.1Q Clause 36). +- If autonegotiation is active and the link is currently down, the future + mode is not yet known. + +Because of these constraints, the kernel stores the requested setting +and applies it only when the link is in a compatible state. + +Implications for userspace: + +1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is + remembered even if it cannot be applied immediately. +2. Applied conditionally: when the link comes up, the kernel enables + PAUSE only if the active mode allows it. + +Component Roles in Flow Control +=============================== + +The configuration of flow control involves several components, each with a +distinct role. + +The MAC (Media Access Controller) +--------------------------------- +The MAC is the hardware component that actually sends and receives PAUSE +frames. Its capabilities define the upper limit of what the driver can support. +For link-wide PAUSE, MACs can vary in their support for symmetric (both +directions) or asymmetric (independent TX/RX) flow control. + +For PFC, the MAC must be capable of generating and interpreting the +priority-based PAUSE frames and managing separate pause states for each +traffic class. + +Many MACs also implement automatic PAUSE frame transmission based on the fill +level of their internal RX FIFO. This is typically configured with two +thresholds: + +* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this + threshold, the MAC automatically transmits a PAUSE frame to stop the sender. + +* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this + threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell + the sender it can resume transmission. + +The PHY (Physical Layer Transceiver) +------------------------------------ +The PHY's role is distinct for each flow control mechanism: + +* **Link-wide PAUSE**: During the autonegotiation process, the PHY is + responsible for advertising the device's flow control capabilities. See the + "Link-wide PAUSE Autonegotiation Details" section for more information. + +* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the + CSMA/CD process. It performs carrier sensing (checking if the line is idle) + and collision detection, which is the mechanism leveraged to throttle the + sender. + +* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in + negotiating PFC capabilities. Its role is to establish the physical link. + PFC negotiation happens at a higher layer via the Data Center Bridging + Capability Exchange Protocol (DCBX). 
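+The advertisement mapping shown earlier is implemented by the kernel's
+``mii_advertise_flowctrl()`` helper (include/linux/mii.h), essentially:
+
+.. code-block:: c
+
+    /* map FLOW_CTRL_RX/FLOW_CTRL_TX to the Pause/Asym advertisement bits */
+    static inline u16 mii_advertise_flowctrl(int cap)
+    {
+            u16 adv = 0;
+
+            if (cap & FLOW_CTRL_RX)
+                    adv = ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM;
+            if (cap & FLOW_CTRL_TX)
+                    adv ^= ADVERTISE_PAUSE_ASYM;
+
+            return adv;
+    }
+
+which reproduces the ``tx``/``rx`` table above bit for bit.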
+ +User Space Interface +==================== +The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for +PFC. They communicate with the kernel to configure the network device driver +and underlying hardware. + +**Link-wide PAUSE Netlink Interface (``ethtool``)** + +See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) +for the authoritative definition of the Pause control and Pause statistics +attributes. The generated UAPI is in +``include/uapi/linux/ethtool_netlink_generated.h``. + +**PFC Netlink Interface (``dcb``)** + +The authoritative definitions for DCB/PFC netlink attributes and commands are in +``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB +subsystem documentation for userspace configuration details. + diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c775cababc8c..52aafdc85f6a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,6 +55,7 @@ Contents: eql fib_trie filter + flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index b0f2ef83735d..40cc0a988d60 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,16 +343,8 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -The PHY does not participate directly in flow control/pause frames except by -making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in -MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC -controller supports such a thing. Since flow control/pause frames generation -involves the Ethernet MAC driver, it is recommended that this driver takes care -of properly indicating advertisement and support for such features by setting -the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done -either before or after phy_connect() and/or as a result of implementing the -ethtool::set_pauseparam feature. - +For detailed link-wide PAUSE and PFC behavior and configuration, see +flow_control.rst. Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..eeed1ea50369 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,9 +953,48 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * @get_pauseparam: Report pause parameters - * @set_pauseparam: Set pause parameters. Returns a negative error code - * or zero. + * + * @get_pauseparam: Report the configured policy for link-wide PAUSE + * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam + * such that: + * @autoneg: + * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only + * and is independent of generic link autonegotiation configured + * via ethtool -s. + * true -> the device follows the negotiated result of pause + * autonegotiation (Pause/Asym); + * false -> the device uses a forced MAC state independent of + * negotiation. + * @rx_pause/@tx_pause: + * represent the desired policy (preferred configuration). + * In autoneg mode they describe what is to be advertised; + * in forced mode they describe the MAC state to apply. 
+ * + * Drivers (and/or frameworks) should persist this policy across link + * changes and reapply appropriate MAC programming when link parameters + * change. + * + * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). + * If @autoneg is true: + * Arrange for pause advertisement (Pause/Asym) based on + * @rx_pause/@tx_pause and program the MAC to follow the + * negotiated result (which may be symmetric, asymmetric, or off + * depending on the link partner). + * If @autoneg is false: + * Do not rely on autonegotiation; force the MAC RX/TX pause + * state directly per @rx_pause/@tx_pause. + * + * Implementations that integrate with PHYLIB/PHYLINK should cooperate + * with those frameworks for advertisement and resolution; MAC drivers are + * still responsible for applying the required MAC state. + * + * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if + * link-wide PAUSE is unsupported. If only symmetric pause is supported, + * reject unsupported asymmetric requests with -EINVAL (or document any + * coercion policy). + * + * See also: Documentation/networking/flow_control.rst + * * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 0e8ac0d974e2..3dd9d7cde86e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum { +enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum { +enum ethtool_a_pause { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 03eb1d941fca..91ee22f53774 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,6 +27,8 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. + * See Documentation/networking/flow_control.rst for a high level description + * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..eacf6a4859bf 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +/* See Documentation/networking/flow_control.rst for a high level description of + * the userspace interface. + */ + #include "netlink.h" #include "common.h" -- cgit v1.2.3 From 9c328f54741bd5465ca1dc717c84c04242fac2e1 Mon Sep 17 00:00:00 2001 From: Deepak Sharma Date: Thu, 25 Sep 2025 18:58:46 +0530 Subject: net: nfc: nci: Add parameter validation for packet data Syzbot reported an uninitialized value bug in nci_init_req, which was introduced by commit 5aca7966d2a7 ("Merge tag 'perf-tools-fixes-for-v6.17-2025-09-16' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools"). This bug arises due to very limited and poor input validation that was done at nci_valid_size().
This validation only checks skb->len (which directly reflects the size provided at the userspace interface) against the length provided in the buffer itself (interpreted as NCI_HEADER). This leads to processing memory content at the given address assuming the layout the opcode requires, and hence to accesses to the `sk_buff->data` buffer which has not been assigned anything yet. Following the same silent drop of packets with invalid sizes at `nci_valid_size()`, add validation of the data in the respective handlers and return error values in case of failure. Release the skb if error values are returned from the handlers in `nci_ntf_packet`, effectively doing a silent drop. Possible TODO: because we silently drop the packets, the call to `nci_request` will keep waiting for completion of the request and will face timeouts. These timeouts can get excessively logged in dmesg. Properly handling them may require exporting `nci_request_cancel` (or propagating error handling from the ntf packet handlers). Reported-by: syzbot+740e04c2a93467a0f8c8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=740e04c2a93467a0f8c8 Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Tested-by: syzbot+740e04c2a93467a0f8c8@syzkaller.appspotmail.com Cc: stable@vger.kernel.org Signed-off-by: Deepak Sharma Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250925132846.213425-1-deepak.sharma.472935@gmail.com Signed-off-by: Paolo Abeni --- net/nfc/nci/ntf.c | 135 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 99 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index a818eff27e6b..418b84e2b260 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -27,11 +27,16 @@ /* Handle NCI Notification packets */ -static void nci_core_reset_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_core_reset_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ - const struct nci_core_reset_ntf *ntf = (void *)skb->data; + const struct nci_core_reset_ntf *ntf; + + if (skb->len < sizeof(struct nci_core_reset_ntf)) + return -EINVAL; + + ntf = (struct nci_core_reset_ntf *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", @@ -42,15 +47,22 @@ static void nci_core_reset_ntf_packet(struct nci_dev *ndev, __le32_to_cpu(ntf->manufact_specific_info); nci_req_complete(ndev, NCI_STATUS_OK); + + return 0; } -static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) +static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) { - struct nci_core_conn_credit_ntf *ntf = (void *) skb->data; + struct nci_core_conn_credit_ntf *ntf; struct nci_conn_info *conn_info; int i; + if (skb->len < sizeof(struct nci_core_conn_credit_ntf)) + return -EINVAL; + + ntf = (struct nci_core_conn_credit_ntf *)skb->data; + pr_debug("num_entries %d\n", ntf->num_entries); if (ntf->num_entries > NCI_MAX_NUM_CONN) @@ -68,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, conn_info = nci_get_conn_info_by_conn_id(ndev, ntf->conn_entries[i].conn_id); if (!conn_info) - return; + return 0; atomic_add(ntf->conn_entries[i].credits, &conn_info->credits_cnt); @@ -77,12 +89,19 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, /* trigger the next tx */ if (!skb_queue_empty(&ndev->tx_q)) queue_work(ndev->tx_wq, &ndev->tx_work); + + return 0; }
-static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_core_generic_error_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - __u8 status = skb->data[0]; + __u8 status; + + if (skb->len < 1) + return -EINVAL; + + status = skb->data[0]; pr_debug("status 0x%x\n", status); @@ -91,12 +110,19 @@ static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, (the state remains the same) */ nci_req_complete(ndev, status); } + + return 0; } -static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) +static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) { - struct nci_core_intf_error_ntf *ntf = (void *) skb->data; + struct nci_core_intf_error_ntf *ntf; + + if (skb->len < sizeof(struct nci_core_intf_error_ntf)) + return -EINVAL; + + ntf = (struct nci_core_intf_error_ntf *)skb->data; ntf->conn_id = nci_conn_id(&ntf->conn_id); @@ -105,6 +131,8 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); + + return 0; } static const __u8 * @@ -329,13 +357,18 @@ void nci_clear_target_list(struct nci_dev *ndev) ndev->n_targets = 0; } -static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; - const __u8 *data = skb->data; + const __u8 *data; bool add_target = true; + if (skb->len < sizeof(struct nci_rf_discover_ntf)) + return -EINVAL; + + data = skb->data; + ntf.rf_discovery_id = *data++; ntf.rf_protocol = *data++; ntf.rf_tech_and_mode = *data++; @@ -390,6 +423,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets); } + + return 0; } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, @@ -553,14 +588,19 @@ static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, return NCI_STATUS_OK; } -static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; - const __u8 *data = skb->data; + const __u8 *data; int err = NCI_STATUS_OK; + if (skb->len < sizeof(struct nci_rf_intf_activated_ntf)) + return -EINVAL; + + data = skb->data; + ntf.rf_discovery_id = *data++; ntf.rf_interface = *data++; ntf.rf_protocol = *data++; @@ -667,7 +707,7 @@ exit: if (err == NCI_STATUS_OK) { conn_info = ndev->rf_conn_info; if (!conn_info) - return; + return 0; conn_info->max_pkt_payload_len = ntf.max_data_pkt_payload_size; conn_info->initial_num_credits = ntf.initial_num_credits; @@ -721,19 +761,26 @@ listen: pr_err("error when signaling tm activation\n"); } } + + return 0; } -static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { const struct nci_conn_info *conn_info; - const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data; + const struct nci_rf_deactivate_ntf *ntf; + + if (skb->len < sizeof(struct nci_rf_deactivate_ntf)) + return -EINVAL; + + ntf = (struct nci_rf_deactivate_ntf *)skb->data; pr_debug("entry, type 0x%x, reason 
0x%x\n", ntf->type, ntf->reason); conn_info = ndev->rf_conn_info; if (!conn_info) - return; + return 0; /* drop tx data queue */ skb_queue_purge(&ndev->tx_q); @@ -765,14 +812,20 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, } nci_req_complete(ndev, NCI_STATUS_OK); + + return 0; } -static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { u8 status = NCI_STATUS_OK; - const struct nci_nfcee_discover_ntf *nfcee_ntf = - (struct nci_nfcee_discover_ntf *)skb->data; + const struct nci_nfcee_discover_ntf *nfcee_ntf; + + if (skb->len < sizeof(struct nci_nfcee_discover_ntf)) + return -EINVAL; + + nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; /* NFCForum NCI 9.2.1 HCI Network Specific Handling * If the NFCC supports the HCI Network, it SHALL return one, @@ -783,6 +836,8 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, ndev->cur_params.id = nfcee_ntf->nfcee_id; nci_req_complete(ndev, status); + + return 0; } void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) @@ -809,35 +864,43 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) switch (ntf_opcode) { case NCI_OP_CORE_RESET_NTF: - nci_core_reset_ntf_packet(ndev, skb); + if (nci_core_reset_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_CONN_CREDITS_NTF: - nci_core_conn_credits_ntf_packet(ndev, skb); + if (nci_core_conn_credits_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_GENERIC_ERROR_NTF: - nci_core_generic_error_ntf_packet(ndev, skb); + if (nci_core_generic_error_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_INTF_ERROR_NTF: - nci_core_conn_intf_error_ntf_packet(ndev, skb); + if (nci_core_conn_intf_error_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_DISCOVER_NTF: - nci_rf_discover_ntf_packet(ndev, skb); + if (nci_rf_discover_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_INTF_ACTIVATED_NTF: - nci_rf_intf_activated_ntf_packet(ndev, skb); + if (nci_rf_intf_activated_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_DEACTIVATE_NTF: - nci_rf_deactivate_ntf_packet(ndev, skb); + if (nci_rf_deactivate_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_NFCEE_DISCOVER_NTF: - nci_nfcee_discover_ntf_packet(ndev, skb); + if (nci_nfcee_discover_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_NFCEE_ACTION_NTF: -- cgit v1.2.3 From 2ade91705b596b7b6b7de84c0ca59eced7acd1f6 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 26 Sep 2025 10:41:13 +0300 Subject: tipc: adjust tipc_nodeid2string() to return string length Since the value returned by 'tipc_nodeid2string()' is not used, the function may be adjusted to return the length of the result, which is helpful to drop a few calls to 'strlen()' in 'tipc_link_create()' and 'tipc_link_bc_create()'. Compile tested only. 
Signed-off-by: Dmitry Antipov Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250926074113.914399-1-dmantipov@yandex.ru Signed-off-by: Paolo Abeni --- net/tipc/addr.c | 6 +++--- net/tipc/addr.h | 2 +- net/tipc/link.c | 9 +++------ 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index fd0796269eed..6f5c54cbf8d9 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -79,7 +79,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) pr_info("Node number set to %u\n", addr); } -char *tipc_nodeid2string(char *str, u8 *id) +int tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; @@ -109,7 +109,7 @@ char *tipc_nodeid2string(char *str, u8 *id) if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; - return str; + return i; } /* Translate to hex string */ @@ -120,5 +120,5 @@ char *tipc_nodeid2string(char *str, u8 *id) for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; - return str; + return i + 1; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 93f82398283d..a113cf7e1f89 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -130,6 +130,6 @@ static inline int in_own_node(struct net *net, u32 addr) bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); -char *tipc_nodeid2string(char *str, u8 *id); +int tipc_nodeid2string(char *str, u8 *id); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 3ee44d731700..931f55f781a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -495,11 +495,9 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /* Set link name for unicast links only */ if (peer_id) { - tipc_nodeid2string(self_str, tipc_own_id(net)); - if (strlen(self_str) > 16) + if (tipc_nodeid2string(self_str, tipc_own_id(net)) > NODE_ID_LEN) sprintf(self_str, "%x", self); - tipc_nodeid2string(peer_str, peer_id); - if (strlen(peer_str) > 16) + if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN) sprintf(peer_str, "%x", peer); } /* Peer i/f name will be completed by reset/activate message */ @@ -570,8 +568,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, if (peer_id) { char peer_str[NODE_ID_STR_LEN] = {0,}; - tipc_nodeid2string(peer_str, peer_id); - if (strlen(peer_str) > 16) + if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN) sprintf(peer_str, "%x", peer); /* Broadcast receiver link name: "broadcast-link:" */ snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, -- cgit v1.2.3 From a1b501a8c6a87c9265fd03bd004035199e2e8128 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Fri, 26 Sep 2025 16:16:05 +0300 Subject: page_pool: Clamp pool size to max 16K pages page_pool_init() returns E2BIG when the page_pool size goes above 32K pages. As some drivers are configuring the page_pool size according to the MTU and ring size, there are cases where this limit is exceeded and the queue creation fails. The page_pool size doesn't have to cover a full queue, especially for larger ring sizes. So clamp the size instead of returning an error. Do this in the core to avoid having each driver do the clamping. The current limit was deemed too high [1] so it was reduced to 16K to avoid page waste.
[1] https://lore.kernel.org/all/1758532715-820422-3-git-send-email-tariqt@nvidia.com/ Signed-off-by: Dragos Tatulea Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250926131605.2276734-2-dtatulea@nvidia.com Signed-off-by: Paolo Abeni --- net/core/page_pool.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 36a98f2bcac3..492728f9e021 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -211,11 +211,7 @@ static int page_pool_init(struct page_pool *pool, return -EINVAL; if (pool->p.pool_size) - ring_qsize = pool->p.pool_size; - - /* Sanity limit mem that can be pinned down */ - if (ring_qsize > 32768) - return -E2BIG; + ring_qsize = min(pool->p.pool_size, 16384); /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, -- cgit v1.2.3 From f857478d62066ee94831a5e0679fc18c246cd534 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 27 Sep 2025 15:54:13 -0700 Subject: netdevsim: a basic test PSP implementation Provide a PSP implementation for netdevsim. Use psp_dev_encapsulate() and psp_dev_rcv() to do actual encapsulation and decapsulation on skbs, but perform no encryption or decryption. In order to make encryption with a bad key result in a drop on the peer's rx side, we stash our psd's generation number in the first byte of each key before handing to the peer. Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20250927225420.1443468-2-kuba@kernel.org Reviewed-by: Willem de Bruijn Signed-off-by: Paolo Abeni --- drivers/net/netdevsim/Makefile | 4 + drivers/net/netdevsim/netdev.c | 43 +++++++- drivers/net/netdevsim/netdevsim.h | 27 +++++ drivers/net/netdevsim/psp.c | 225 ++++++++++++++++++++++++++++++++++++++ net/core/skbuff.c | 1 + 5 files changed, 294 insertions(+), 6 deletions(-) create mode 100644 drivers/net/netdevsim/psp.c (limited to 'net') diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile index f8de93bc5f5b..14a553e000ec 100644 --- a/drivers/net/netdevsim/Makefile +++ b/drivers/net/netdevsim/Makefile @@ -18,6 +18,10 @@ ifneq ($(CONFIG_PSAMPLE),) netdevsim-objs += psample.o endif +ifneq ($(CONFIG_INET_PSP),) +netdevsim-objs += psp.o +endif + ifneq ($(CONFIG_MACSEC),) netdevsim-objs += macsec.o endif diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 0178219f0db5..ebc3833e95b4 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -103,28 +103,42 @@ static int nsim_napi_rx(struct net_device *tx_dev, struct net_device *rx_dev, static int nsim_forward_skb(struct net_device *tx_dev, struct net_device *rx_dev, struct sk_buff *skb, - struct nsim_rq *rq) + struct nsim_rq *rq, + struct skb_ext *psp_ext) { - return __dev_forward_skb(rx_dev, skb) ?: - nsim_napi_rx(tx_dev, rx_dev, rq, skb); + int ret; + + ret = __dev_forward_skb(rx_dev, skb); + if (ret) + return ret; + + nsim_psp_handle_ext(skb, psp_ext); + + return nsim_napi_rx(tx_dev, rx_dev, rq, skb); } static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); + struct skb_ext *psp_ext = NULL; struct net_device *peer_dev; unsigned int len = skb->len; struct netdevsim *peer_ns; struct netdev_config *cfg; struct nsim_rq *rq; int rxq; + int dr; rcu_read_lock(); if (!nsim_ipsec_tx(ns, skb)) - goto out_drop_free; + goto out_drop_any; peer_ns = 
rcu_dereference(ns->peer); if (!peer_ns) + goto out_drop_any; + + dr = nsim_do_psp(skb, ns, peer_ns, &psp_ext); + if (dr) goto out_drop_free; peer_dev = peer_ns->netdev; @@ -141,7 +155,8 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_linearize(skb); skb_tx_timestamp(skb); - if (unlikely(nsim_forward_skb(dev, peer_dev, skb, rq) == NET_RX_DROP)) + if (unlikely(nsim_forward_skb(dev, peer_dev, + skb, rq, psp_ext) == NET_RX_DROP)) goto out_drop_cnt; if (!hrtimer_active(&rq->napi_timer)) @@ -151,8 +166,10 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) dev_dstats_tx_add(dev, len); return NETDEV_TX_OK; +out_drop_any: + dr = SKB_DROP_REASON_NOT_SPECIFIED; out_drop_free: - dev_kfree_skb(skb); + kfree_skb_reason(skb, dr); out_drop_cnt: rcu_read_unlock(); dev_dstats_tx_dropped(dev); @@ -1002,6 +1019,7 @@ static void nsim_queue_uninit(struct netdevsim *ns) static int nsim_init_netdevsim(struct netdevsim *ns) { + struct netdevsim *peer; struct mock_phc *phc; int err; @@ -1036,6 +1054,10 @@ static int nsim_init_netdevsim(struct netdevsim *ns) goto err_ipsec_teardown; rtnl_unlock(); + err = nsim_psp_init(ns); + if (err) + goto err_unregister_netdev; + if (IS_ENABLED(CONFIG_DEBUG_NET)) { ns->nb.notifier_call = netdev_debug_event; if (register_netdevice_notifier_dev_net(ns->netdev, &ns->nb, @@ -1045,6 +1067,13 @@ static int nsim_init_netdevsim(struct netdevsim *ns) return 0; +err_unregister_netdev: + rtnl_lock(); + peer = rtnl_dereference(ns->peer); + if (peer) + RCU_INIT_POINTER(peer->peer, NULL); + RCU_INIT_POINTER(ns->peer, NULL); + unregister_netdevice(ns->netdev); err_ipsec_teardown: nsim_ipsec_teardown(ns); nsim_macsec_teardown(ns); @@ -1132,6 +1161,8 @@ void nsim_destroy(struct netdevsim *ns) unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb, &ns->nn); + nsim_psp_uninit(ns); + rtnl_lock(); peer = rtnl_dereference(ns->peer); if (peer) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index bddd24c1389d..02c1c97b7008 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -108,6 +108,12 @@ struct netdevsim { int rq_reset_mode; + struct { + struct psp_dev *dev; + u32 spi; + u32 assoc_cnt; + } psp; + struct nsim_bus_dev *nsim_bus_dev; struct bpf_prog *bpf_offloaded; @@ -421,6 +427,27 @@ static inline void nsim_macsec_teardown(struct netdevsim *ns) } #endif +#if IS_ENABLED(CONFIG_INET_PSP) +int nsim_psp_init(struct netdevsim *ns); +void nsim_psp_uninit(struct netdevsim *ns); +void nsim_psp_handle_ext(struct sk_buff *skb, struct skb_ext *psp_ext); +enum skb_drop_reason +nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, + struct netdevsim *peer_ns, struct skb_ext **psp_ext); +#else +static inline int nsim_psp_init(struct netdevsim *ns) { return 0; } +static inline void nsim_psp_uninit(struct netdevsim *ns) {} +static inline enum skb_drop_reason +nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, + struct netdevsim *peer_ns, struct skb_ext **psp_ext) +{ + return 0; +} + +static inline void +nsim_psp_handle_ext(struct sk_buff *skb, struct skb_ext *psp_ext) {} +#endif + struct nsim_bus_dev { struct device dev; struct list_head list; diff --git a/drivers/net/netdevsim/psp.c b/drivers/net/netdevsim/psp.c new file mode 100644 index 000000000000..332b5b744f01 --- /dev/null +++ b/drivers/net/netdevsim/psp.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "netdevsim.h" + +void 
nsim_psp_handle_ext(struct sk_buff *skb, struct skb_ext *psp_ext) +{ + if (psp_ext) + __skb_ext_set(skb, SKB_EXT_PSP, psp_ext); +} + +enum skb_drop_reason +nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, + struct netdevsim *peer_ns, struct skb_ext **psp_ext) +{ + enum skb_drop_reason rc = 0; + struct psp_assoc *pas; + struct net *net; + void **ptr; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + if (!pas) { + rc = SKB_NOT_DROPPED_YET; + goto out_unlock; + } + + if (!skb_transport_header_was_set(skb)) { + rc = SKB_DROP_REASON_PSP_OUTPUT; + goto out_unlock; + } + + ptr = psp_assoc_drv_data(pas); + if (*ptr != ns) { + rc = SKB_DROP_REASON_PSP_OUTPUT; + goto out_unlock; + } + + net = sock_net(skb->sk); + if (!psp_dev_encapsulate(net, skb, pas->tx.spi, pas->version, 0)) { + rc = SKB_DROP_REASON_PSP_OUTPUT; + goto out_unlock; + } + + /* Now pretend we just received this frame */ + if (peer_ns->psp.dev->config.versions & (1 << pas->version)) { + bool strip_icv = false; + u8 generation; + + /* We cheat a bit and put the generation in the key. + * In real life if generation was too old, then decryption would + * fail. Here, we just make it so a bad key causes a bad + * generation too, and psp_sk_rx_policy_check() will fail. + */ + generation = pas->tx.key[0]; + + skb_ext_reset(skb); + skb->mac_len = ETH_HLEN; + if (psp_dev_rcv(skb, peer_ns->psp.dev->id, generation, + strip_icv)) { + rc = SKB_DROP_REASON_PSP_OUTPUT; + goto out_unlock; + } + + *psp_ext = skb->extensions; + refcount_inc(&(*psp_ext)->refcnt); + skb->decrypted = 1; + } else { + struct ipv6hdr *ip6h __maybe_unused; + struct iphdr *iph; + struct udphdr *uh; + __wsum csum; + + /* Do not decapsulate. Receive the skb with the udp and psp + * headers still there as if this is a normal udp packet. + * psp_dev_encapsulate() sets udp checksum to 0, so we need to + * provide a valid checksum here, so the skb isn't dropped. 
+ */ + uh = udp_hdr(skb); + csum = skb_checksum(skb, skb_transport_offset(skb), + ntohs(uh->len), 0); + + switch (skb->protocol) { + case htons(ETH_P_IP): + iph = ip_hdr(skb); + uh->check = udp_v4_check(ntohs(uh->len), iph->saddr, + iph->daddr, csum); + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + ip6h = ipv6_hdr(skb); + uh->check = udp_v6_check(ntohs(uh->len), &ip6h->saddr, + &ip6h->daddr, csum); + break; +#endif + } + + uh->check = uh->check ?: CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_NONE; + } + +out_unlock: + rcu_read_unlock(); + return rc; +} + +static int +nsim_psp_set_config(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static int +nsim_rx_spi_alloc(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack) +{ + struct netdevsim *ns = psd->drv_priv; + unsigned int new; + int i; + + new = ++ns->psp.spi & PSP_SPI_KEY_ID; + if (psd->generation & 1) + new |= PSP_SPI_KEY_PHASE; + + assoc->spi = cpu_to_be32(new); + assoc->key[0] = psd->generation; + for (i = 1; i < PSP_MAX_KEY; i++) + assoc->key[i] = ns->psp.spi + i; + + return 0; +} + +static int nsim_assoc_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + struct netdevsim *ns = psd->drv_priv; + void **ptr = psp_assoc_drv_data(pas); + + /* Copy drv_priv from psd to assoc */ + *ptr = psd->drv_priv; + ns->psp.assoc_cnt++; + + return 0; +} + +static int nsim_key_rotate(struct psp_dev *psd, struct netlink_ext_ack *extack) +{ + return 0; +} + +static void nsim_assoc_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + struct netdevsim *ns = psd->drv_priv; + void **ptr = psp_assoc_drv_data(pas); + + *ptr = NULL; + ns->psp.assoc_cnt--; +} + +static struct psp_dev_ops nsim_psp_ops = { + .set_config = nsim_psp_set_config, + .rx_spi_alloc = nsim_rx_spi_alloc, + .tx_key_add = nsim_assoc_add, + .tx_key_del = nsim_assoc_del, + .key_rotate = nsim_key_rotate, +}; + +static struct psp_dev_caps nsim_psp_caps = { + .versions = 1 << PSP_VERSION_HDR0_AES_GCM_128 | + 1 << PSP_VERSION_HDR0_AES_GMAC_128 | + 1 << PSP_VERSION_HDR0_AES_GCM_256 | + 1 << PSP_VERSION_HDR0_AES_GMAC_256, + .assoc_drv_spc = sizeof(void *), +}; + +void nsim_psp_uninit(struct netdevsim *ns) +{ + if (!IS_ERR(ns->psp.dev)) + psp_dev_unregister(ns->psp.dev); + WARN_ON(ns->psp.assoc_cnt); +} + +static ssize_t +nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, + loff_t *ppos) +{ + struct netdevsim *ns = file->private_data; + int err; + + nsim_psp_uninit(ns); + + ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, + &nsim_psp_caps, ns); + err = PTR_ERR_OR_ZERO(ns->psp.dev); + return err ?: count; +} + +static const struct file_operations nsim_psp_rereg_fops = { + .open = simple_open, + .write = nsim_psp_rereg_write, + .llseek = generic_file_llseek, + .owner = THIS_MODULE, +}; + +int nsim_psp_init(struct netdevsim *ns) +{ + struct dentry *ddir = ns->nsim_dev_port->ddir; + int err; + + ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, + &nsim_psp_caps, ns); + err = PTR_ERR_OR_ZERO(ns->psp.dev); + if (err) + return err; + + debugfs_create_file("psp_rereg", 0200, ddir, ns, &nsim_psp_rereg_fops); + return 0; +} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index daaf6da43cc9..618afd59afff 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7048,6 +7048,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } 
+EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); /** * skb_ext_add - allocate space for given extension, COW if needed -- cgit v1.2.3 From 9c94ae6bb0b2895024b6e29fcc1cbec968b4776a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:32 +0000 Subject: net: make softnet_data.defer_count an atomic This is preparation work to remove the softnet_data.defer_lock, as it is contended on hosts with a large number of cores. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- net/core/dev.c | 2 +- net/core/skbuff.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b85454116f6..27e3fa69253f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3538,7 +3538,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; - int defer_count; + atomic_t defer_count; int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; diff --git a/net/core/dev.c b/net/core/dev.c index 8b54fdf0289a..8566678d8344 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6726,7 +6726,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; - sd->defer_count = 0; + atomic_set(&sd->defer_count, 0); spin_unlock(&sd->defer_lock); while (skb != NULL) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 618afd59afff..16cd357d62a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7202,14 +7202,12 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + if (atomic_read(&sd->defer_count) >= defer_max) goto nodefer; spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. */ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ -- cgit v1.2.3 From 844c9db7f7f5fe1b0b53ed9f1c2bc7313b3021c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:33 +0000 Subject: net: use llist for sd->defer_list Get rid of sd->defer_lock and adopt llist operations. We optimize skb_attempt_defer_free() for the common case, where the packet is queued. Otherwise sd->defer_count keeps increasing until skb_defer_free_flush() clears it.
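[Aside: taken together, these two changes reduce the deferred-free queue to an atomically counted lock-free LIFO list: producers push with a compare-and-swap, the consumer detaches the whole list with one exchange. Below is a stand-alone C11 sketch of that shape; DEFER_MAX, struct node and the function names are illustrative stand-ins, not the kernel llist API.]

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define DEFER_MAX 128

struct node {
	struct node *next;
};

static struct node *_Atomic defer_list;
static atomic_long defer_count;

/* Producer: count with one atomic increment, then push with a CAS loop
 * (roughly llist_add()); no spinlock is taken. Returns true when the
 * queue has just reached half capacity, which is when the flushing CPU
 * should be kicked. The kernel also frees inline once the count passes
 * a limit; that fallback is elided here. */
static bool defer_add(struct node *n)
{
	long old = atomic_fetch_add(&defer_count, 1);
	struct node *head = atomic_load(&defer_list);

	do {
		n->next = head;
	} while (!atomic_compare_exchange_weak(&defer_list, &head, n));

	return old == (DEFER_MAX >> 1);
}

/* Consumer: reset the counter, then detach the whole list with a single
 * exchange (roughly llist_del_all()); entries come back newest-first. */
static struct node *defer_flush(void)
{
	atomic_store(&defer_count, 0);
	return atomic_exchange(&defer_list, NULL);
}

int main(void)
{
	struct node a, b;

	defer_add(&a);
	defer_add(&b);
	return defer_flush() == &b ? 0 : 1;
}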
Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 8 ++++---- net/core/dev.c | 18 ++++++------------ net/core/skbuff.c | 15 +++++++-------- 3 files changed, 17 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 27e3fa69253f..5c9aa16933d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3537,10 +3537,10 @@ struct softnet_data { struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ - spinlock_t defer_lock ____cacheline_aligned_in_smp; - atomic_t defer_count; - int defer_ipi_scheduled; - struct sk_buff *defer_list; + struct llist_head defer_list ____cacheline_aligned_in_smp; + atomic_long_t defer_count; + + int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/net/core/dev.c b/net/core/dev.c index 8566678d8344..fb67372774de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6717,22 +6717,16 @@ EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { + struct llist_node *free_list; struct sk_buff *skb, *next; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) + if (llist_empty(&sd->defer_list)) return; + atomic_long_set(&sd->defer_count, 0); + free_list = llist_del_all(&sd->defer_list); - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - atomic_set(&sd->defer_count, 0); - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; + llist_for_each_entry_safe(skb, next, free_list, ll_node) { napi_consume_skb(skb, 1); - skb = next; } } @@ -12995,7 +12989,7 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); + init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16cd357d62a6..17455fc1e692 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,6 +7185,7 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + unsigned long defer_count; int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned int defer_max; @@ -7202,17 +7203,15 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (atomic_read(&sd->defer_count) >= defer_max) + defer_count = atomic_long_inc_return(&sd->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_bh(&sd->defer_lock); - /* Send an IPI every time queue reaches half capacity. */ - kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); + llist_add(&skb->ll_node, &sd->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_bh(&sd->defer_lock); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). 
-- cgit v1.2.3 From 5628f3fe3b16114e8424bbfcf0594caef8958a06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:34 +0000 Subject: net: add NUMA awareness to skb_attempt_defer_free() Instead of sharing sd->defer_list & sd->defer_count with many cpus, add one pair for each NUMA node. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 4 ---- include/net/hotdata.h | 7 +++++++ net/core/dev.c | 35 +++++++++++++++++++++++------------ net/core/dev.h | 2 +- net/core/skbuff.c | 11 ++++++----- 5 files changed, 37 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c9aa16933d1..d1a687444b27 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3536,10 +3536,6 @@ struct softnet_data { struct numa_drop_counters drop_counters; - /* Another possibly contended cache line */ - struct llist_head defer_list ____cacheline_aligned_in_smp; - atomic_long_t defer_count; - int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index fda94b2647ff..4acec191c54a 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -2,10 +2,16 @@ #ifndef _NET_HOTDATA_H #define _NET_HOTDATA_H +#include #include #include #include +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; + /* Read mostly data used in network fast paths. */ struct net_hotdata { #if IS_ENABLED(CONFIG_INET) @@ -30,6 +36,7 @@ struct net_hotdata { struct rps_sock_flow_table __rcu *rps_sock_flow_table; u32 rps_cpu_mask; #endif + struct skb_defer_node __percpu *skb_defer_nodes; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index fb67372774de..a64cef2c537e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5180,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -6715,18 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - if (llist_empty(&sd->defer_list)) - return; - atomic_long_set(&sd->defer_count, 0); - free_list = llist_del_all(&sd->defer_list); + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; + + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - llist_for_each_entry_safe(skb, next, free_list, ll_node) { - napi_consume_skb(skb, 1); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + napi_consume_skb(skb, 1); + } } } @@ -6854,7 +6861,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7713,7 +7720,7 @@ static 
void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7755,7 +7762,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -12989,7 +12996,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12999,6 +13005,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index d6b08d435479..900880e8b5b4 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17455fc1e692..bc12790017b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,9 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7201,14 +7201,15 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); - sd = &per_cpu(softnet_data, cpu); + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - defer_count = atomic_long_inc_return(&sd->defer_count); + defer_count = atomic_long_inc_return(&sdn->defer_count); if (defer_count >= defer_max) goto nodefer; - llist_add(&skb->ll_node, &sd->defer_list); + llist_add(&skb->ll_node, &sdn->defer_list); /* Send an IPI every time queue reaches half capacity. */ kick = (defer_count - 1) == (defer_max >> 1); @@ -7217,7 +7218,7 @@ nodefer: kfree_skb_napi_cache(skb); * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, -- cgit v1.2.3 From ffe381923d87c8cf5d4372c12eea6127dc36fd3a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 22 Sep 2025 17:36:48 -0400 Subject: sunrpc: unexport rpc_malloc() and rpc_free() These are not used outside of sunrpc code. 
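[Aside: a user-space analogue of dropping an EXPORT_SYMBOL_GPL() line is narrowing a function from external to internal linkage: still compiled in and usable locally, but no longer visible to outside users. The sketch below is a hypothetical illustration, not the sunrpc code.]

#include <stdio.h>

/* External linkage: other translation units may link against this,
 * loosely what EXPORT_SYMBOL_GPL() grants to other kernel modules. */
int buffer_bytes_used(void)
{
	return 42;
}

/* Internal linkage: callable only within this file, the way an
 * unexported symbol remains usable inside its own subsystem but
 * disappears from the module symbol table. */
static int buffer_bytes_free(void)
{
	return 8;
}

int main(void)
{
	printf("used=%d free=%d\n", buffer_bytes_used(), buffer_bytes_free());
	return 0;
}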
Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9b45fbdc90ca..016f16ca5779 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1074,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task) rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } -EXPORT_SYMBOL_GPL(rpc_malloc); /** * rpc_free - free RPC buffer resources allocated via rpc_malloc @@ -1095,7 +1094,6 @@ void rpc_free(struct rpc_task *task) else kfree(buf); } -EXPORT_SYMBOL_GPL(rpc_free); /* * Creation and deletion of RPC task structures -- cgit v1.2.3 From 94b04355e6397a0a70b69c2571fa5c7d9990b835 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 15 Sep 2025 16:46:03 -0700 Subject: Drivers: hv: Add CONFIG_HYPERV_VMBUS option At present the VMBus driver is hinged off of CONFIG_HYPERV, which entails a lot of builtin code and encompasses too much. It's not always clear what depends on builtin hv code and what depends on VMBus. Setting CONFIG_HYPERV as a module and fudging the Makefile to switch to builtin adds even more confusion. VMBus is an independent module and should have its own config option. Also, there are scenarios like baremetal dom0/root where support is built in with CONFIG_HYPERV but without VMBus. Lastly, there are more features coming down that use CONFIG_HYPERV and add more dependencies on it. So, create a fine-grained HYPERV_VMBUS option and update Kconfigs for dependency on VMBus. Signed-off-by: Mukesh Rathor Acked-by: Bjorn Helgaas # drivers/pci Signed-off-by: Wei Liu --- drivers/gpu/drm/Kconfig | 2 +- drivers/hid/Kconfig | 2 +- drivers/hv/Kconfig | 11 +++++++++-- drivers/hv/Makefile | 2 +- drivers/input/serio/Kconfig | 4 ++-- drivers/net/hyperv/Kconfig | 2 +- drivers/pci/Kconfig | 2 +- drivers/scsi/Kconfig | 2 +- drivers/uio/Kconfig | 2 +- drivers/video/fbdev/Kconfig | 2 +- include/asm-generic/mshyperv.h | 8 +++++--- net/vmw_vsock/Kconfig | 2 +- 12 files changed, 25 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index f7ea8e895c0c..58f34da061c6 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -398,7 +398,7 @@ source "drivers/gpu/drm/imagination/Kconfig" config DRM_HYPERV tristate "DRM Support for Hyper-V synthetic video device" - depends on DRM && PCI && HYPERV + depends on DRM && PCI && HYPERV_VMBUS select DRM_CLIENT_SELECTION select DRM_KMS_HELPER select DRM_GEM_SHMEM_HELPER diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index a57901203aeb..fe3dc8c0db99 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -1162,7 +1162,7 @@ config GREENASIA_FF config HID_HYPERV_MOUSE tristate "Microsoft Hyper-V mouse driver" - depends on HYPERV + depends on HYPERV_VMBUS help Select this option to enable the Hyper-V mouse driver. diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index e24f6299c376..29f8637f441a 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -45,18 +45,25 @@ config HYPERV_TIMER config HYPERV_UTILS tristate "Microsoft Hyper-V Utilities driver" - depends on HYPERV && CONNECTOR && NLS + depends on HYPERV_VMBUS && CONNECTOR && NLS depends on PTP_1588_CLOCK_OPTIONAL help Select this option to enable the Hyper-V Utilities. config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" - depends on HYPERV + depends on HYPERV_VMBUS select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver.
+config HYPERV_VMBUS + tristate "Microsoft Hyper-V VMBus driver" + depends on HYPERV + default HYPERV + help + Select this option to enable Hyper-V Vmbus driver. + config MSHV_ROOT tristate "Microsoft Hyper-V root partition support" depends on HYPERV && (X86_64 || ARM64) diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 976189c725dc..4bb41663767d 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_HYPERV) += hv_vmbus.o +obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o obj-$(CONFIG_MSHV_ROOT) += mshv_root.o diff --git a/drivers/input/serio/Kconfig b/drivers/input/serio/Kconfig index 17edc1597446..c7ef347a4dff 100644 --- a/drivers/input/serio/Kconfig +++ b/drivers/input/serio/Kconfig @@ -276,8 +276,8 @@ config SERIO_OLPC_APSP config HYPERV_KEYBOARD tristate "Microsoft Synthetic Keyboard driver" - depends on HYPERV - default HYPERV + depends on HYPERV_VMBUS + default HYPERV_VMBUS help Select this option to enable the Hyper-V Keyboard driver. diff --git a/drivers/net/hyperv/Kconfig b/drivers/net/hyperv/Kconfig index c8cbd85adcf9..982964c1a9fb 100644 --- a/drivers/net/hyperv/Kconfig +++ b/drivers/net/hyperv/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config HYPERV_NET tristate "Microsoft Hyper-V virtual network driver" - depends on HYPERV + depends on HYPERV_VMBUS select UCS2_STRING select NLS help diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 9a249c65aedc..7065a8e5f9b1 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -221,7 +221,7 @@ config PCI_LABEL config PCI_HYPERV tristate "Hyper-V PCI Frontend" - depends on ((X86 && X86_64) || ARM64) && HYPERV && PCI_MSI && SYSFS + depends on ((X86 && X86_64) || ARM64) && HYPERV_VMBUS && PCI_MSI && SYSFS select PCI_HYPERV_INTERFACE select IRQ_MSI_LIB help diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 5522310bab8d..19d0884479a2 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -589,7 +589,7 @@ config XEN_SCSI_FRONTEND config HYPERV_STORAGE tristate "Microsoft Hyper-V virtual storage driver" - depends on SCSI && HYPERV + depends on SCSI && HYPERV_VMBUS depends on m || SCSI_FC_ATTRS != m default HYPERV help diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig index b060dcd7c635..6f86a61231e6 100644 --- a/drivers/uio/Kconfig +++ b/drivers/uio/Kconfig @@ -140,7 +140,7 @@ config UIO_MF624 config UIO_HV_GENERIC tristate "Generic driver for Hyper-V VMBus" - depends on HYPERV + depends on HYPERV_VMBUS help Generic driver that you can bind, dynamically, to any Hyper-V VMBus device. 
It is useful to provide direct access diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index c21484d15f0c..72c63eaeb983 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -1774,7 +1774,7 @@ config FB_BROADSHEET config FB_HYPERV tristate "Microsoft Hyper-V Synthetic Video support" - depends on FB && HYPERV + depends on FB && HYPERV_VMBUS select DMA_CMA if HAVE_DMA_CONTIGUOUS && CMA select FB_IOMEM_HELPERS_DEFERRED help diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index dbd4c2f3aee3..64ba6bc807d9 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -163,6 +163,7 @@ static inline u64 hv_generate_guest_id(u64 kernel_version) return guest_id; } +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) /* Free the message slot and signal end-of-message if required */ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) { @@ -198,6 +199,10 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) } } +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + int hv_get_hypervisor_version(union hv_hypervisor_version_info *info); void hv_setup_vmbus_handler(void (*handler)(void)); @@ -211,9 +216,6 @@ void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs)); void hv_remove_crash_handler(void); void hv_setup_mshv_handler(void (*handler)(void)); -extern int vmbus_interrupt; -extern int vmbus_irq; - #if IS_ENABLED(CONFIG_HYPERV) /* * Hypervisor's notion of virtual processor ID is different from diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 56356d2980c8..8e803c4828c4 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -72,7 +72,7 @@ config VIRTIO_VSOCKETS_COMMON config HYPERV_VSOCKETS tristate "Hyper-V transport for Virtual Sockets" - depends on VSOCKETS && HYPERV + depends on VSOCKETS && HYPERV_VMBUS help This module implements a Hyper-V transport for Virtual Sockets. -- cgit v1.2.3 From 1a98f5699bd57c9b3f66ec54cc38571d5e42ffb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Sep 2025 15:45:06 +0200 Subject: Revert "Documentation: net: add flow control guide and document ethtool API" This reverts commit 7bd80ed89d72285515db673803b021469ba71ee8. I should not have merged it to begin with due to pending review and changes to be addressed. Link: https://patch.msgid.link/c6f3af12df9b7998920a02027fc8893ce82afc4c.1759239721.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 -- Documentation/networking/flow_control.rst | 373 ------------------------- Documentation/networking/index.rst | 1 - Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 +-- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 - net/ethtool/pause.c | 4 - 8 files changed, 15 insertions(+), 453 deletions(-) delete mode 100644 Documentation/networking/flow_control.rst (limited to 'net') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index e4852505294f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,9 +864,7 @@ attribute-sets: - name: pause-stat - doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). 
attr-cnt-name: __ethtool-a-pause-stat-cnt - enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -877,17 +875,13 @@ attribute-sets: type: pad - name: tx-frames - doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames - doc: Number of PAUSE frames received. type: u64 - name: pause - doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt - enum-name: ethtool-a-pause attributes: - name: unspec @@ -899,40 +893,19 @@ attribute-sets: nested-attributes: header - name: autoneg - doc: | - Acts as a mode selector for the driver. - On GET: indicates the driver's behavior. If true, the driver will - respect the negotiated outcome; if false, the driver will use a - forced configuration. - On SET: if true, the driver configures the PHY's advertisement based - on the rx and tx attributes. If false, the driver forces the MAC - into the state defined by the rx and tx attributes. type: u8 - name: rx - doc: | - Enable receiving PAUSE frames (pausing local TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: tx - doc: | - Enable transmitting PAUSE frames (pausing peer TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: stats - doc: | - Contains the pause statistics counters. The source of these - statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src - doc: | - Selects the source of the MAC statistics, values from - enum ethtool_mac_stats_src. This allows requesting statistics - from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst deleted file mode 100644 index 48646d54513f..000000000000 --- a/Documentation/networking/flow_control.rst +++ /dev/null @@ -1,373 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _ethernet-flow-control: - -===================== -Ethernet Flow Control -===================== - -This document is a practical guide to Ethernet Flow Control in Linux, covering -what it is, how it works, and how to configure it. - -What is Flow Control? -===================== - -Flow control is a mechanism to prevent a fast sender from overwhelming a -slow receiver with data, which would cause buffer overruns and dropped packets. -The receiver can signal the sender to temporarily stop transmitting, giving it -time to process its backlog. - -Standards references -==================== - -Ethernet flow control mechanisms are specified across consolidated IEEE base -standards; some originated as amendments: - -- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** - (half-duplex). -- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** - (originally **802.3x**). -- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** - (originally **802.1Qbb**). - -In the remainder of this document, the consolidated clause numbers are used. - -How It Works: The Mechanisms -============================ - -The method used for flow control depends on the link's duplex mode. - -.. note:: - The user-visible ``ethtool`` pause API described in this document controls - **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the - collision-based behavior that exists on half-duplex links. - -1. Half-Duplex: Collision-Based Flow Control --------------------------------------------- -On half-duplex links, a device cannot send and receive simultaneously, so PAUSE -frames are not used. 
Flow control is achieved by leveraging the CSMA/CD -(Carrier Sense Multiple Access with Collision Detection) protocol itself. - -* **How it works**: To inhibit incoming data, a receiving device can force a - collision on the line. When the sending station detects this collision, it - terminates its transmission, sends a "jam" signal, and then executes the - "Collision backoff and retransmission" procedure as defined in IEEE 802.3, - Section 4.2.3.2.5. This algorithm makes the sender wait for a random - period before attempting to retransmit. By repeatedly forcing collisions, - the receiver can effectively throttle the sender's transmission rate. - -.. note:: - While this mechanism is part of the IEEE standard, there is currently no - generic kernel API to configure or control it. Drivers should not enable - this feature until a standardized interface is available. - -.. warning:: - On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a - hub rather than a switch) forcing collisions inhibits traffic **across the - entire shared segment**, not just a single point-to-point link. Enabling - such behavior is generally undesirable. - -2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) ------------------------------------------------------- -On full-duplex links, devices can send and receive at the same time. Flow -control is achieved by sending a special **PAUSE frame**, defined by IEEE -802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore -called *link-wide PAUSE*. - -* **What it is**: A standard Ethernet frame with a globally reserved - destination MAC address (``01-80-C2-00-00-01``). This address is in a range - that standard IEEE 802.1D-compliant bridges do not forward. However, some - unmanaged or misconfigured bridges have been reported to forward these - frames, which can disrupt flow control across a network. - -* **How it works**: The frame contains a MAC Control opcode for PAUSE - (``0x0001``) and a ``pause_time`` value, telling the sender how long to - wait before sending more data frames. This time is specified in units of - "pause quantum", where one quantum is the time it takes to transmit 512 bits. - For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, - and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates - that the transmitter can resume transmission, even if a previous non-zero - pause time has not yet elapsed. - -* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. - -3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) -------------------------------------------------------------------------- -Priority-based Flow Control is an enhancement to the standard PAUSE mechanism -that allows flow control to be applied independently to different classes of -traffic, identified by their priority level. - -* **What it is**: PFC allows a receiver to pause traffic for one or more of the - 8 standard priority levels without stopping traffic for other priorities. - This is critical in data center environments for protocols that cannot - tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet - or RoCE). - -* **How it works**: PFC uses a specific PAUSE frame format. It shares the same - globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy - PAUSE frames but uses a unique opcode (``0x0101``). 
The frame payload - contains two key fields: - - - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to - one of the 8 priorities. If a bit is set to 1, it means the pause time - for that priority is active. - - **``time_vector``**: A list of eight 2-octet fields, one for each priority. - Each field specifies the ``pause_time`` for its corresponding priority, - measured in units of ``pause_quanta`` (the time to transmit 512 bits). - -.. note:: - When PFC is enabled for at least one priority on a port, the standard - **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. - The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). - -Configuring Flow Control -======================== - -Link-wide PAUSE and Priority-based Flow Control are configured with different -tools. - -Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) -------------------------------------------------------------------- -Use ``ethtool -a `` to view and ``ethtool -A `` to change -the link-wide PAUSE settings. - -.. code-block:: bash - - # View current link-wide PAUSE settings - ethtool -a eth0 - - # Enable RX and TX pause, with autonegotiation - ethtool -A eth0 autoneg on rx on tx on - -**Key Configuration Concepts**: - -* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` - controls **Pause Autoneg** (Annex 31B) only. It is independent from the - **Generic link autonegotiation** configured with ``ethtool -s``. A device can - have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. - -* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** - advertise pause in the PHY. The MAC PAUSE state is **forced** according to - ``rx``/``tx`` and does not depend on partner capabilities or resolution. - Ensure the peer is configured complementarily for PAUSE to be effective. - -* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy - is **remembered** by the kernel and applied later when Generic autoneg is - enabled again. - -* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` - capabilities. The final active state is determined by what both sides of the - link agree on. See the "PHY (Physical Layer Transceiver)" section below, - especially the *Resolution* subsection, for details of the negotiation rules. - -* **Forced Mode**: This mode is necessary when autonegotiation is not used or - not possible. This includes links where one or both partners have - autonegotiation disabled, or in setups without a PHY (e.g., direct - MAC-to-MAC connections). The driver bypasses PHY advertisement and - directly forces the MAC into the specified ``rx``/``tx`` state. The - configuration on both sides of the link must be complementary. For - example, if one side is set to ``tx on`` ``rx off``, the link partner must be - set to ``tx off`` ``rx on`` for flow control to function correctly. - -Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) ----------------------------------------------------- -PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the -``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this -document shows ``dcb(8)`` examples. - -**Viewing PFC Settings**: - -.. code-block:: text - - $ dcb pfc show dev eth0 - pfc-cap 8 macsec-bypass off delay 4096 - prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on - -This shows the PFC state (on/off) for each priority (0-7). - -**Changing PFC Settings**: - -.. 
code-block:: bash - - # Enable PFC on priorities 6 and 7, leaving others as they are - $ dcb pfc set dev eth0 prio-pfc 6:on 7:on - - # Disable PFC for all priorities except 6 and 7 - $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on - -Monitoring Flow Control -======================= - -The standard way to check if flow control is actively being used is to view the -pause-related statistics. - -**Monitoring Link-wide PAUSE**: -Use ``ethtool --include-statistics -a ``. - -.. code-block:: text - - $ ethtool --include-statistics -a eth0 - Pause parameters for eth0: - ... - Statistics: - tx_pause_frames: 0 - rx_pause_frames: 0 - -**Monitoring PFC**: -PFC statistics (sent and received frames per priority) are available -through the ``dcb`` tool. - -.. code-block:: text - - $ dcb pfc show dev eth0 requests indications - requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 - indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 - -The ``requests`` counters track transmitted PFC frames (TX), and the -``indications`` counters track received PFC frames (RX). - -Link-wide PAUSE Autonegotiation Details -======================================= - -The autonegotiation process for link-wide PAUSE is managed by the PHY and -involves advertising capabilities and resolving the outcome. - -* Terminology (link-wide PAUSE): - - - **Symmetric pause**: both directions are paused when requested (TX+RX - enabled). - - **Asymmetric pause**: only one direction is paused (e.g., RX-only or - TX-only). - - In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded - using two bits (Pause/Asym) and resolved per the standard truth tables - below. - -* **Advertisement**: The PHY advertises the MAC's flow control capabilities. - This is done using two bits in the advertisement register: "Symmetric - Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be - interpreted as a combined value, not as independent flags. The kernel - converts the user's ``rx`` and ``tx`` settings into this two-bit value as - follows: - - .. code-block:: text - - tx rx | Pause Asym - -------+------------- - 0 0 | 0 0 - 0 1 | 1 1 - 1 0 | 0 1 - 1 1 | 1 0 - -* **Resolution**: After negotiation, the PHY reports the link partner's - advertised Pause and Asym bits. The final flow control mode is determined - by the combination of the local and partner advertisements, according to - the IEEE 802.3 standard: - - .. code-block:: text - - Local Device | Link Partner | Result - Pause Asym | Pause Asym | - -------------------+--------------------+--------- - 0 X | 0 X | Disabled - 0 1 | 1 0 | Disabled - 0 1 | 1 1 | TX only - 1 0 | 0 X | Disabled - 1 X | 1 X | TX + RX - 1 1 | 0 1 | RX only - - It is important to note that the advertised bits reflect the *current - configuration* of the MAC, which may not represent its full hardware - capabilities. - -Kernel Policy: "Set and Trust" -============================== - -The ethtool pause API is defined as a **wish policy** for -IEEE 802.3 link-wide PAUSE only. A user request is always accepted -as the preferred configuration, but it may not be possible to apply -it in all link states. - -Key constraints: - -- Link-wide PAUSE is not valid on half-duplex links. -- Link-wide PAUSE cannot be used together with Priority-based Flow Control - (PFC, IEEE 802.1Q Clause 36). -- If autonegotiation is active and the link is currently down, the future - mode is not yet known. - -Because of these constraints, the kernel stores the requested setting -and applies it only when the link is in a compatible state. 
- -Implications for userspace: - -1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is - remembered even if it cannot be applied immediately. -2. Applied conditionally: when the link comes up, the kernel enables - PAUSE only if the active mode allows it. - -Component Roles in Flow Control -=============================== - -The configuration of flow control involves several components, each with a -distinct role. - -The MAC (Media Access Controller) ---------------------------------- -The MAC is the hardware component that actually sends and receives PAUSE -frames. Its capabilities define the upper limit of what the driver can support. -For link-wide PAUSE, MACs can vary in their support for symmetric (both -directions) or asymmetric (independent TX/RX) flow control. - -For PFC, the MAC must be capable of generating and interpreting the -priority-based PAUSE frames and managing separate pause states for each -traffic class. - -Many MACs also implement automatic PAUSE frame transmission based on the fill -level of their internal RX FIFO. This is typically configured with two -thresholds: - -* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this - threshold, the MAC automatically transmits a PAUSE frame to stop the sender. - -* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this - threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell - the sender it can resume transmission. - -The PHY (Physical Layer Transceiver) ------------------------------------- -The PHY's role is distinct for each flow control mechanism: - -* **Link-wide PAUSE**: During the autonegotiation process, the PHY is - responsible for advertising the device's flow control capabilities. See the - "Link-wide PAUSE Autonegotiation Details" section for more information. - -* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the - CSMA/CD process. It performs carrier sensing (checking if the line is idle) - and collision detection, which is the mechanism leveraged to throttle the - sender. - -* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in - negotiating PFC capabilities. Its role is to establish the physical link. - PFC negotiation happens at a higher layer via the Data Center Bridging - Capability Exchange Protocol (DCBX). - -User Space Interface -==================== -The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for -PFC. They communicate with the kernel to configure the network device driver -and underlying hardware. - -**Link-wide PAUSE Netlink Interface (``ethtool``)** - -See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) -for the authoritative definition of the Pause control and Pause statistics -attributes. The generated UAPI is in -``include/uapi/linux/ethtool_netlink_generated.h``. - -**PFC Netlink Interface (``dcb``)** - -The authoritative definitions for DCB/PFC netlink attributes and commands are in -``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB -subsystem documentation for userspace configuration details. 
- diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 52aafdc85f6a..c775cababc8c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,7 +55,6 @@ Contents: eql fib_trie filter - flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 40cc0a988d60..b0f2ef83735d 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,8 +343,16 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -For detailed link-wide PAUSE and PFC behavior and configuration, see -flow_control.rst. +The PHY does not participate directly in flow control/pause frames except by +making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in +MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC +controller supports such a thing. Since flow control/pause frames generation +involves the Ethernet MAC driver, it is recommended that this driver takes care +of properly indicating advertisement and support for such features by setting +the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done +either before or after phy_connect() and/or as a result of implementing the +ethtool::set_pauseparam feature. + Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index eeed1ea50369..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,48 +953,9 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * - * @get_pauseparam: Report the configured policy for link-wide PAUSE - * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam - * such that: - * @autoneg: - * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only - * and is independent of generic link autonegotiation configured - * via ethtool -s. - * true -> the device follows the negotiated result of pause - * autonegotiation (Pause/Asym); - * false -> the device uses a forced MAC state independent of - * negotiation. - * @rx_pause/@tx_pause: - * represent the desired policy (preferred configuration). - * In autoneg mode they describe what is to be advertised; - * in forced mode they describe the MAC state to apply. - * - * Drivers (and/or frameworks) should persist this policy across link - * changes and reapply appropriate MAC programming when link parameters - * change. - * - * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). - * If @autoneg is true: - * Arrange for pause advertisement (Pause/Asym) based on - * @rx_pause/@tx_pause and program the MAC to follow the - * negotiated result (which may be symmetric, asymmetric, or off - * depending on the link partner). - * If @autoneg is false: - * Do not rely on autonegotiation; force the MAC RX/TX pause - * state directly per @rx_pause/@tx_pause. - * - * Implementations that integrate with PHYLIB/PHYLINK should cooperate - * with those frameworks for advertisement and resolution; MAC drivers are - * still responsible for applying the required MAC state. - * - * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if - * link-wide PAUSE is unsupported. 
If only symmetric pause is supported, - * reject unsupported asymmetric requests with -EINVAL (or document any - * coercion policy). - * - * See also: Documentation/networking/flow_control.rst - * + * @get_pauseparam: Report pause parameters + * @set_pauseparam: Set pause parameters. Returns a negative error code + * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3dd9d7cde86e..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum ethtool_a_pause_stat { +enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum ethtool_a_pause { +enum { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 91ee22f53774..03eb1d941fca 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,8 +27,6 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. - * See Documentation/networking/flow_control.rst for a high level description - * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index eacf6a4859bf..0f9af1e66548 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* See Documentation/networking/flow_control.rst for a high level description of - * the userspace interface. - */ - #include "netlink.h" #include "common.h" -- cgit v1.2.3 From d8e97cc476e33037ac69c5b09b351f5cc8d0589d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 16:00:19 -0700 Subject: SUNRPC: Make RPCSEC_GSS_KRB5 select CRYPTO instead of depending on it Make RPCSEC_GSS_KRB5 select CRYPTO instead of depending on it. This unblocks the eventual removal of the selection of CRYPTO from NFSD_V4, which will no longer be needed by nfsd itself due to switching to the crypto library functions. But NFSD_V4 selects RPCSEC_GSS_KRB5, which still needs CRYPTO. It makes more sense for RPCSEC_GSS_KRB5 to select CRYPTO itself, like most other kconfig options that need CRYPTO do. 
Signed-off-by: Eric Biggers Acked-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..33aafdc8392e 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -18,9 +18,10 @@ config SUNRPC_SWAP config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" - depends on SUNRPC && CRYPTO + depends on SUNRPC default y select SUNRPC_GSS + select CRYPTO select CRYPTO_SKCIPHER select CRYPTO_HASH help -- cgit v1.2.3 From 7a0f94361ffd6e1d31c79023e8674b492bef05e3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 30 Sep 2025 19:24:26 -0700 Subject: net: psp: don't assume reply skbs will have a socket Rx path may be passing around unreferenced sockets, which means that skb_set_owner_edemux() may not set skb->sk and PSP will crash: KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] RIP: 0010:psp_reply_set_decrypted (./include/net/psp/functions.h:132 net/psp/psp_sock.c:287) tcp_v6_send_response.constprop.0 (net/ipv6/tcp_ipv6.c:979) tcp_v6_send_reset (net/ipv6/tcp_ipv6.c:1140 (discriminator 1)) tcp_v6_do_rcv (net/ipv6/tcp_ipv6.c:1683) tcp_v6_rcv (net/ipv6/tcp_ipv6.c:1912) Fixes: 659a2899a57d ("tcp: add datapath logic for PSP with inline key exchange") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251001022426.2592750-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/psp/functions.h | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/psp/psp_sock.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index ef7743664da3..c5c23a54774e 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -34,7 +34,7 @@ unsigned int psp_key_size(u32 version); void psp_sk_assoc_free(struct sock *sk); void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); void psp_twsk_assoc_free(struct inet_timewait_sock *tw); -void psp_reply_set_decrypted(struct sk_buff *skb); +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { @@ -160,7 +160,7 @@ static inline void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5ca97ede979c..ff11d3a85a36 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1668,7 +1668,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, nskb->ip_summed = CHECKSUM_NONE; if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); - psp_reply_set_decrypted(nskb); + psp_reply_set_decrypted(orig_sk, nskb); } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9622c2776ade..59c4977a811a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -974,7 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. 
*/ skb_set_owner_edemux(buff, (struct sock *)sk); - psp_reply_set_decrypted(buff); + psp_reply_set_decrypted(sk, buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c index 5324a7603bed..a931d825d1cc 100644 --- a/net/psp/psp_sock.c +++ b/net/psp/psp_sock.c @@ -279,12 +279,12 @@ void psp_twsk_assoc_free(struct inet_timewait_sock *tw) psp_assoc_put(pas); } -void psp_reply_set_decrypted(struct sk_buff *skb) +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { struct psp_assoc *pas; rcu_read_lock(); - pas = psp_sk_get_assoc_rcu(skb->sk); + pas = psp_sk_get_assoc_rcu(sk); if (pas && pas->tx.spi) skb->decrypted = 1; rcu_read_unlock(); -- cgit v1.2.3 From 1b54b0756f051c11f5a5d0fbc1581e0b9a18e2bc Mon Sep 17 00:00:00 2001 From: Bhanu Seshu Kumar Valluri Date: Wed, 1 Oct 2025 16:27:15 +0530 Subject: net: doc: Fix typos in docs Fix typos in doc comments. Signed-off-by: Bhanu Seshu Kumar Valluri Link: https://patch.msgid.link/20251001105715.50462-1-bhanuseshukumar@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- net/tipc/crypto.c | 2 +- net/tipc/topsrv.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7a54a8b4d277..3c7634482356 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -297,7 +297,7 @@ static inline const char *phy_modes(phy_interface_t interface) * * Description: maps RGMII supported link speeds into the clock rates. * This can also be used for MII, GMII, and RMII interface modes as the - * clock rates are indentical, but the caller must be aware that errors + * clock rates are identical, but the caller must be aware that errors * for unsupported clock rates will not be signalled. * * Returns: clock rate or negative errno @@ -519,7 +519,7 @@ enum phy_state { * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. - * @device_ids: The device identifer for each present device. + * @device_ids: The device identifier for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index ea5bb131ebd0..751904f10aab 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1797,7 +1797,7 @@ exit: * @b: bearer where the message has been received * * If the decryption is successful, the decrypted skb is returned directly or - * as the callback, the encryption header and auth tag will be trimed out + * as the callback, the encryption header and auth tag will be trimmed out * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). * Otherwise, the skb will be freed! 
* Note: RX key(s) can be re-aligned, or in case of no key suitable, TX diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index ffe577bf6b51..aad7f96b6009 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -57,7 +57,7 @@ * @conn_idr: identifier set of connection * @idr_lock: protect the connection identifier set * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance + * @net: network namespace instance * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue @@ -83,7 +83,7 @@ struct tipc_topsrv { * @sock: socket handler associated with connection * @flags: indicates connection state * @server: pointer to connected server - * @sub_list: lsit to all pertaing subscriptions + * @sub_list: list to all pertaining subscriptions * @sub_lock: lock protecting the subscription list * @rwork: receive work item * @outqueue: pointer to first outbound message in queue -- cgit v1.2.3 From 2f3119686ef50319490ccaec81a575973da98815 Mon Sep 17 00:00:00 2001 From: Alexandr Sapozhnikov Date: Thu, 2 Oct 2025 12:14:47 +0300 Subject: net/sctp: fix a null dereference in sctp_disposition sctp_sf_do_5_1D_ce() If new_asoc->peer.adaptation_ind is 0 and sctp_ulpevent_make_authkey() returns 0, then the variable ai_ev remains NULL and that NULL pointer will be dereferenced in the sctp_ulpevent_free() function. Signed-off-by: Alexandr Sapozhnikov Acked-by: Xin Long Fixes: 30f6ebf65bc4 ("sctp: add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT") Link: https://patch.msgid.link/20251002091448.11-1-alsp705@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/sm_statefuns.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 4cb8f393434d..3755ba079d07 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -886,7 +886,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, return SCTP_DISPOSITION_CONSUME; nomem_authev: - sctp_ulpevent_free(ai_ev); + if (ai_ev) + sctp_ulpevent_free(ai_ev); nomem_aiev: sctp_ulpevent_free(ev); nomem_ev: -- cgit v1.2.3 From 2e7cbbbe3d61c63606994b7ff73c72537afe2e1c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 1 Oct 2025 23:37:54 +0000 Subject: tcp: Don't call reqsk_fastopen_remove() in tcp_conn_request(). syzbot reported the splat below in tcp_conn_request(). [0] If a listener is close()d while a TFO socket is being processed in tcp_conn_request(), inet_csk_reqsk_queue_add() does not set reqsk->sk and calls inet_child_forget(), which calls tcp_disconnect() for the TFO socket. After the cited commit, tcp_disconnect() calls reqsk_fastopen_remove(), where reqsk_put() is called due to !reqsk->sk. Then, reqsk_fastopen_remove() in tcp_conn_request() decrements the last req->rsk_refcnt and frees reqsk, and __reqsk_free() at the drop_and_free label causes the refcount underflow for the listener and double-free of the reqsk. Let's remove reqsk_fastopen_remove() in tcp_conn_request(). Note that other callers make sure tp->fastopen_rsk is not NULL. [0]: refcount_t: underflow; use-after-free.
WARNING: CPU: 12 PID: 5563 at lib/refcount.c:28 refcount_warn_saturate (lib/refcount.c:28) Modules linked in: CPU: 12 UID: 0 PID: 5563 Comm: syz-executor Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2025 RIP: 0010:refcount_warn_saturate (lib/refcount.c:28) Code: ab e8 8e b4 98 ff 0f 0b c3 cc cc cc cc cc 80 3d a4 e4 d6 01 00 75 9c c6 05 9b e4 d6 01 01 48 c7 c7 e8 df fb ab e8 6a b4 98 ff <0f> 0b e9 03 5b 76 00 cc 80 3d 7d e4 d6 01 00 0f 85 74 ff ff ff c6 RSP: 0018:ffffa79fc0304a98 EFLAGS: 00010246 RAX: d83af4db1c6b3900 RBX: ffff9f65c7a69020 RCX: d83af4db1c6b3900 RDX: 0000000000000000 RSI: 00000000ffff7fff RDI: ffffffffac78a280 RBP: 000000009d781b60 R08: 0000000000007fff R09: ffffffffac6ca280 R10: 0000000000017ffd R11: 0000000000000004 R12: ffff9f65c7b4f100 R13: ffff9f65c7d23c00 R14: ffff9f65c7d26000 R15: ffff9f65c7a64ef8 FS: 00007f9f962176c0(0000) GS:ffff9f65fcf00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000180 CR3: 000000000dbbe006 CR4: 0000000000372ef0 Call Trace: tcp_conn_request (./include/linux/refcount.h:400 ./include/linux/refcount.h:432 ./include/linux/refcount.h:450 ./include/net/sock.h:1965 ./include/net/request_sock.h:131 net/ipv4/tcp_input.c:7301) tcp_rcv_state_process (net/ipv4/tcp_input.c:6708) tcp_v6_do_rcv (net/ipv6/tcp_ipv6.c:1670) tcp_v6_rcv (net/ipv6/tcp_ipv6.c:1906) ip6_protocol_deliver_rcu (net/ipv6/ip6_input.c:438) ip6_input (net/ipv6/ip6_input.c:500) ipv6_rcv (net/ipv6/ip6_input.c:311) __netif_receive_skb (net/core/dev.c:6104) process_backlog (net/core/dev.c:6456) __napi_poll (net/core/dev.c:7506) net_rx_action (net/core/dev.c:7569 net/core/dev.c:7696) handle_softirqs (kernel/softirq.c:579) do_softirq (kernel/softirq.c:480) Fixes: 45c8a6cc2bcd ("tcp: Clear tcp_sk(sk)->fastopen_rsk in tcp_disconnect().") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251001233755.1340927-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b44fdc309633..31ea5af49f2d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7509,7 +7509,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { - reqsk_fastopen_remove(fastopen_sk, req, false); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); goto drop_and_free; -- cgit v1.2.3 From 95920c2ed02bde551ab654e9749c2ca7bc3100e0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 30 Sep 2025 13:43:29 +0200 Subject: page_pool: Fix PP_MAGIC_MASK to avoid crashing on some 32-bit arches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Helge reported that the introduction of PP_MAGIC_MASK led to crashes on boot on his 32-bit parisc machine. The cause of this is that the mask is set too wide, so page_pool_page_is_pp() incurs false positives, which crash the machine. Just disabling the check in page_pool_is_pp() will lead to the page_pool code itself malfunctioning; so instead of doing this, this patch changes the define for PP_DMA_INDEX_BITS to avoid mistaking arbitrary kernel pointers for page_pool-tagged pages. The fix relies on the kernel pointers that alias with the pp_magic field always being above PAGE_OFFSET.
With this assumption, we can use the lowest bit of the value of PAGE_OFFSET as the upper bound of the PP_DMA_INDEX_MASK, which should avoid the false positives. Because we cannot rely on PAGE_OFFSET always being a compile-time constant, nor on it always being >0, we fall back to disabling the dma_index storage when there are not enough bits available. This leaves us in the situation we were in before the patch in the Fixes tag, but only on a subset of architecture configurations. This seems to be the best we can do until the transition to page types is complete for page_pool pages. v2: - Make sure there's at least 8 bits available and that the PAGE_OFFSET bit calculation doesn't wrap Link: https://lore.kernel.org/all/aMNJMFa5fDalFmtn@p100/ Fixes: ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them when destroying the pool") Cc: stable@vger.kernel.org # 6.15+ Tested-by: Helge Deller Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Mina Almasry Tested-by: Helge Deller Link: https://patch.msgid.link/20250930114331.675412-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 22 ++++++++------- net/core/page_pool.c | 76 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..0905eb6b55ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4159,14 +4159,13 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * since this value becomes part of PP_SIGNATURE; meaning we can just use the * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is - * 0, we make sure that we leave the two topmost bits empty, as that guarantees - * we won't mistake a valid kernel pointer for a value we set, regardless of the - * VMSPLIT setting. + * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is + * known at compile-time. * - * Altogether, this means that the number of bits available is constrained by - * the size of an unsigned long (at the upper end, subtracting two bits per the - * above), and the definition of PP_SIGNATURE (with or without - * POISON_POINTER_DELTA). + * If the value of PAGE_OFFSET is not known at compile time, or if it is too + * small to leave at least 8 bits available above PP_SIGNATURE, we define the + * number of bits to be 0, which turns off the DMA index tracking altogether + * (see page_pool_register_dma_index()). */ #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) #if POISON_POINTER_DELTA > 0 @@ -4175,8 +4174,13 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); */ #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) #else -/* Always leave out the topmost two; see above. */ -#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */ +#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8)) +#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \ + PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \ + !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ?
\ + MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0) + #endif #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 492728f9e021..1a5edec485f1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -468,11 +468,60 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, } } +static int page_pool_register_dma_index(struct page_pool *pool, + netmem_ref netmem, gfp_t gfp) +{ + int err = 0; + u32 id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + goto out; + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto out; + } + + netmem_set_dma_index(netmem, id); +out: + return err; +} + +static int page_pool_release_dma_index(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + return 0; + + id = netmem_get_dma_index(netmem); + if (!id) + return -1; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return -1; + + netmem_set_dma_index(netmem, 0); + + return 0; +} + static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; - u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -491,18 +540,10 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t g goto unmap_failed; } - if (in_softirq()) - err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - else - err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - if (err) { - WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + err = page_pool_register_dma_index(pool, netmem, gfp); + if (err) goto unset_failed; - } - netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; @@ -680,8 +721,6 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, netmem_ref netmem) { - struct page *old, *page = netmem_to_page(netmem); - unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -690,15 +729,7 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo */ return; - id = netmem_get_dma_index(netmem); - if (!id) - return; - - if (in_softirq()) - old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); - else - old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); - if (old != page) + if (page_pool_release_dma_index(pool, netmem)) return; dma = page_pool_get_dma_addr_netmem(netmem); @@ -708,7 +739,6 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); - netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). 
API users can have a need -- cgit v1.2.3 From 21b29e74ffe5a6c851c235bb80bf5ee26292c67b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Oct 2025 18:41:19 +0000 Subject: tcp: take care of zero tp->window_clamp in tcp_set_rcvlowat() Some applications (like selftests/net/tcp_mmap.c) call SO_RCVLOWAT on their listener before accept(). This has an unfortunate effect on wscale selection in tcp_select_initial_window() during 3WHS. For instance, tcp_mmap was negotiating wscale 4, regardless of tcp_rmem[2] and sysctl_rmem_max. Do not change tp->window_clamp if it is zero or bigger than our computed value. Zero value is special: it allows tcp_select_initial_window() to enable autotuning. Note that SO_RCVLOWAT use on a listener is probably not wise, because tp->scaling_ratio has a default value, possibly wrong. Fixes: d1361840f8c5 ("tcp: fix SO_RCVLOWAT and RCVBUF autotuning") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20251003184119.2526655-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7949d16506a4..8a18aeca7ab0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1791,6 +1791,7 @@ EXPORT_IPV6_MOD(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { + struct tcp_sock *tp = tcp_sk(sk); int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) @@ -1809,7 +1810,9 @@ int tcp_set_rcvlowat(struct sock *sk, int val) space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); - WRITE_ONCE(tcp_sk(sk)->window_clamp, val); + + if (tp->window_clamp && tp->window_clamp < val) + WRITE_ONCE(tp->window_clamp, val); } return 0; } -- cgit v1.2.3 From 23f3770e1a53e6c7a553135011f547209e141e72 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Oct 2025 09:34:18 +0200 Subject: bpf: Fix metadata_dst leak in __bpf_redirect_neigh_v{4,6} Cilium has a BPF egress gateway feature which forces outgoing K8s Pod traffic to pass through dedicated egress gateways which then SNAT the traffic in order to interact with stable IPs outside the cluster. The traffic is directed to the gateway via vxlan tunnel in collect md mode. A recent BPF change utilized the bpf_redirect_neigh() helper to forward packets after the arrival and decap on vxlan, and over time it turned out that kmalloc-256 slab usage in the kernel was ever-increasing. The issue was that vxlan allocates the metadata_dst object and attaches it through a fake dst entry to the skb. The latter was never released, though, given bpf_redirect_neigh() was merely setting the new dst entry via skb_dst_set() without dropping an existing one first.
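For illustration, the fix in the hunks below boils down to the following pattern (a simplified sketch, not the complete helper):

	/* The skb may still carry the tunnel's metadata_dst at this point.
	 * skb_dst_set() overwrites the skb's dst without releasing an
	 * already-attached one, so the old reference would be leaked.
	 */
	skb_dst_drop(skb);	/* release any dst attached by e.g. vxlan */
	skb_dst_set(skb, dst);	/* then attach the freshly looked-up route */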
Fixes: b4ab31414970 ("bpf: Add redirect_neigh helper as redirect drop-in") Reported-by: Yusuke Suzuki Reported-by: Julian Wiedmann Signed-off-by: Daniel Borkmann Cc: Martin KaFai Lau Cc: Jakub Kicinski Cc: Jordan Rife Reviewed-by: Simon Horman Reviewed-by: Jordan Rife Reviewed-by: Jakub Kicinski Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20251003073418.291171-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5d1838ff1ab9..76628df1fc82 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2281,6 +2281,7 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) goto out_drop; + skb_dst_drop(skb); skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; @@ -2389,6 +2390,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, goto out_drop; } + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } -- cgit v1.2.3 From f359b809d54c6e3dd1d039b97e0b68390b0e53e4 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 8 Oct 2025 12:08:16 +0200 Subject: netfilter: nft_objref: validate objref and objrefmap expressions Referencing a synproxy stateful object from the OUTPUT hook causes a kernel crash due to infinite recursive calls: BUG: TASK stack guard page was hit at 000000008bda5b8c (stack is 000000003ab1c4a5..00000000494d8b12) [...] Call Trace: __find_rr_leaf+0x99/0x230 fib6_table_lookup+0x13b/0x2d0 ip6_pol_route+0xa4/0x400 fib6_rule_lookup+0x156/0x240 ip6_route_output_flags+0xc6/0x150 __nf_ip6_route+0x23/0x50 synproxy_send_tcp_ipv6+0x106/0x200 synproxy_send_client_synack_ipv6+0x1aa/0x1f0 nft_synproxy_do_eval+0x263/0x310 nft_do_chain+0x5a8/0x5f0 [nf_tables] nft_do_chain_inet+0x98/0x110 nf_hook_slow+0x43/0xc0 __ip6_local_out+0xf0/0x170 ip6_local_out+0x17/0x70 synproxy_send_tcp_ipv6+0x1a2/0x200 synproxy_send_client_synack_ipv6+0x1aa/0x1f0 [...] Implement objref and objrefmap expression validate functions. Currently, only the NFT_OBJECT_SYNPROXY object type requires validation. This will also handle a jump to a chain using a synproxy object from the OUTPUT hook.
Now when trying to reference a synproxy object in the OUTPUT hook, nft will produce the following error: synproxy_crash.nft: Error: Could not process rule: Operation not supported synproxy name mysynproxy ^^^^^^^^^^^^^^^^^^^^^^^^ Fixes: ee394f96ad75 ("netfilter: nft_synproxy: add synproxy stateful object support") Reported-by: Georg Pfuetzenreuter Closes: https://bugzilla.suse.com/1250237 Signed-off-by: Fernando Fernandez Mancera Reviewed-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nft_objref.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 8ee66a86c3bc..1a62e384766a 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -22,6 +22,35 @@ void nft_objref_eval(const struct nft_expr *expr, obj->ops->eval(obj, regs, pkt); } +static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type) +{ + unsigned int hooks; + + switch (type) { + case NFT_OBJECT_SYNPROXY: + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD); + + return nft_chain_validate_hooks(ctx->chain, hooks); + default: + break; + } + + return 0; +} + +static int nft_objref_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + return nft_objref_validate_obj_type(ctx, obj->ops->type->type); +} + static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -93,6 +122,7 @@ static const struct nft_expr_ops nft_objref_ops = { .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, + .validate = nft_objref_validate, .reduce = NFT_REDUCE_READONLY, }; @@ -197,6 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } +static int nft_objref_map_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_objref_map *priv = nft_expr_priv(expr); + + return nft_objref_validate_obj_type(ctx, priv->set->objtype); +} + static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -206,6 +244,7 @@ static const struct nft_expr_ops nft_objref_map_ops = { .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, + .validate = nft_objref_map_validate, .reduce = NFT_REDUCE_READONLY, }; -- cgit v1.2.3 From bbf0c98b3ad9edaea1f982de6c199cc11d3b7705 Mon Sep 17 00:00:00 2001 From: Eric Woudstra Date: Tue, 7 Oct 2025 10:15:01 +0200 Subject: bridge: br_vlan_fill_forward_path_pvid: use br_vlan_group_rcu() net/bridge/br_private.h:1627 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 7 locks held by socat/410: #0: ffff88800d7a9c90 (sk_lock-AF_INET){+.+.}-{0:0}, at: inet_stream_connect+0x43/0xa0 #1: ffffffff9a779900 (rcu_read_lock){....}-{1:3}, at: __ip_queue_xmit+0x62/0x1830 [..] #6: ffffffff9a779900 (rcu_read_lock){....}-{1:3}, at: nf_hook.constprop.0+0x8a/0x440 Call Trace: lockdep_rcu_suspicious.cold+0x4f/0xb1 br_vlan_fill_forward_path_pvid+0x32c/0x410 [bridge] br_fill_forward_path+0x7a/0x4d0 [bridge] Use the correct helper; the non-_rcu variant requires the RTNL mutex.
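For illustration, the locking contract of the two accessors (a sketch: br_vlan_group() is backed by rtnl_dereference() and therefore asserts the RTNL mutex, which the forwarding path does not hold):

	rcu_read_lock();
	vg = br_vlan_group_rcu(br);	/* rcu_dereference(): valid in an RCU read-side section */
	/* ... look up the pvid in vg ... */
	rcu_read_unlock();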
Fixes: bcf2766b1377 ("net: bridge: resolve forwarding path for VLAN tag actions in bridge devices") Signed-off-by: Eric Woudstra Signed-off-by: Florian Westphal --- net/bridge/br_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index ae911220cb3c..ce72b837ff8e 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1457,7 +1457,7 @@ void br_vlan_fill_forward_path_pvid(struct net_bridge *br, if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return; - vg = br_vlan_group(br); + vg = br_vlan_group_rcu(br); if (idx >= 0 && ctx->vlan[idx].proto == br->vlan_proto) { -- cgit v1.2.3 From 27c0a7b05d13a0dc54ed0b95fc12218210fdea1a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Jul 2025 12:02:27 -0700 Subject: libceph: Use HMAC-SHA256 library instead of crypto_shash Use the HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. Signed-off-by: Eric Biggers Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 ++- net/ceph/Kconfig | 3 +- net/ceph/messenger_v2.c | 77 ++++++++++++------------------------------ 3 files changed, 26 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1717cc57cdac..4b49592a738f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -2,6 +2,7 @@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H +#include #include #include #include @@ -412,7 +413,8 @@ struct ceph_connection_v2_info { struct ceph_msg_data_cursor in_cursor; struct ceph_msg_data_cursor out_cursor; - struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct hmac_sha256_key hmac_key; /* post-auth signature */ + bool hmac_key_set; struct crypto_aead *gcm_tfm; /* on-wire encryption */ struct aead_request *gcm_req; struct crypto_wait gcm_wait; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 0aa21fcbf6ec..ea60e3ef0834 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -6,8 +6,7 @@ config CEPH_LIB select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM - select CRYPTO_HMAC - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 5483b4eed94e..c54c8b5a6526 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -709,7 +709,7 @@ static int setup_crypto(struct ceph_connection *con, dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", __func__, con, con->v2.con_mode, session_key_len, con_secret_len); - WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req); if (con->v2.con_mode != CEPH_CON_MODE_CRC && con->v2.con_mode != CEPH_CON_MODE_SECURE) { @@ -723,22 +723,8 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_none */ } - noio_flag = memalloc_noio_save(); - con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); - memalloc_noio_restore(noio_flag); - if (IS_ERR(con->v2.hmac_tfm)) { - ret = PTR_ERR(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - pr_err("failed to allocate hmac tfm context: %d\n", ret); - return ret; - } - - ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, - session_key_len); - if (ret) { - pr_err("failed to set hmac key: %d\n", ret); - return ret; - } + hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len); + con->v2.hmac_key_set = true; if 
(con->v2.con_mode == CEPH_CON_MODE_CRC) { WARN_ON(con_secret_len); @@ -793,38 +779,26 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static int ceph_hmac_sha256(struct ceph_connection *con, - const struct kvec *kvecs, int kvec_cnt, u8 *hmac) +static void ceph_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, + u8 hmac[SHA256_DIGEST_SIZE]) { - SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ - int ret; + struct hmac_sha256_ctx ctx; int i; - dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, - con->v2.hmac_tfm, kvec_cnt); + dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con, + con->v2.hmac_key_set, kvec_cnt); - if (!con->v2.hmac_tfm) { + if (!con->v2.hmac_key_set) { memset(hmac, 0, SHA256_DIGEST_SIZE); - return 0; /* auth_none */ + return; /* auth_none */ } - desc->tfm = con->v2.hmac_tfm; - ret = crypto_shash_init(desc); - if (ret) - goto out; - - for (i = 0; i < kvec_cnt; i++) { - ret = crypto_shash_update(desc, kvecs[i].iov_base, - kvecs[i].iov_len); - if (ret) - goto out; - } - - ret = crypto_shash_final(desc, hmac); - -out: - shash_desc_zero(desc); - return ret; /* auth_x, both plain and secure modes */ + /* auth_x, both plain and secure modes */ + hmac_sha256_init(&ctx, &con->v2.hmac_key); + for (i = 0; i < kvec_cnt; i++) + hmac_sha256_update(&ctx, kvecs[i].iov_base, kvecs[i].iov_len); + hmac_sha256_final(&ctx, hmac); } static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) @@ -1455,17 +1429,14 @@ static int prepare_auth_request_more(struct ceph_connection *con, static int prepare_auth_signature(struct ceph_connection *con) { void *buf; - int ret; buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, con_secure(con))); if (!buf) return -ENOMEM; - ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs, - con->v2.in_sign_kvec_cnt, CTRL_BODY(buf)); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, SHA256_DIGEST_SIZE); @@ -2460,10 +2431,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs, - con->v2.out_sign_kvec_cnt, hmac); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, + hmac); ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { @@ -3814,10 +3783,8 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); - if (con->v2.hmac_tfm) { - crypto_free_shash(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - } + memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key)); + con->v2.hmac_key_set = false; if (con->v2.gcm_req) { aead_request_free(con->v2.gcm_req); con->v2.gcm_req = NULL; -- cgit v1.2.3 From 59699a5a7114f09f890e86c09a6b32afb5eaa64c Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:53 +0200 Subject: libceph: make ceph_con_get_out_msg() return the message pointer The caller in messenger_v1.c loads it anyway, so let's keep the pointer in the register instead of reloading it from memory. This eliminates a tiny bit of unnecessary overhead. 
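A sketch of the caller pattern this enables (matching the messenger_v1.c hunk below):

	/* before: the helper stored into con->out_msg, forcing a reload */
	ceph_con_get_out_msg(con);
	m = con->out_msg;

	/* after: the pointer comes straight back in a register */
	m = ceph_con_get_out_msg(con);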
Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 4 ++-- net/ceph/messenger_v1.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 4b49592a738f..9ebcac2981fd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -550,7 +550,7 @@ void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); void ceph_con_process_message(struct ceph_connection *con); int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); -void ceph_con_get_out_msg(struct ceph_connection *con); +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); /* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9f6d860411cb..b6c7bfc03503 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2110,7 +2110,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, return ret; } -void ceph_con_get_out_msg(struct ceph_connection *con) +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; @@ -2141,7 +2141,7 @@ void ceph_con_get_out_msg(struct ceph_connection *con) * message or in case of a fault. */ WARN_ON(con->out_msg); - con->out_msg = ceph_msg_get(msg); + return con->out_msg = ceph_msg_get(msg); } /* diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 0cb61c76b9b8..eebe4e19d75a 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -210,8 +210,7 @@ static void prepare_write_message(struct ceph_connection *con) &con->v1.out_temp_ack); } - ceph_con_get_out_msg(con); - m = con->out_msg; + m = ceph_con_get_out_msg(con); dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), -- cgit v1.2.3 From 7399212dcf64d90a6ab239bdd98bd325d922fc7e Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:54 +0200 Subject: libceph: pass the message pointer instead of loading con->out_msg This pointer is in a register anyway, so let's use that instead of reloading from memory everywhere. 
[ idryomov: formatting ] Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 +- net/ceph/messenger.c | 4 +- net/ceph/messenger_v1.c | 45 ++++++----- net/ceph/messenger_v2.c | 168 +++++++++++++++++++++-------------------- 4 files changed, 114 insertions(+), 107 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 9ebcac2981fd..6aa4c6478c9f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -555,7 +555,7 @@ struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); /* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); int ceph_con_v1_try_write(struct ceph_connection *con); -void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg); void ceph_con_v1_revoke_incoming(struct ceph_connection *con); bool ceph_con_v1_opened(struct ceph_connection *con); void ceph_con_v1_reset_session(struct ceph_connection *con); @@ -564,7 +564,7 @@ void ceph_con_v1_reset_protocol(struct ceph_connection *con); /* messenger_v2.c */ int ceph_con_v2_try_read(struct ceph_connection *con); int ceph_con_v2_try_write(struct ceph_connection *con); -void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg); void ceph_con_v2_revoke_incoming(struct ceph_connection *con); bool ceph_con_v2_opened(struct ceph_connection *con); void ceph_con_v2_reset_session(struct ceph_connection *con); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6c7bfc03503..08a6a083609f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1793,9 +1793,9 @@ void ceph_msg_revoke(struct ceph_msg *msg) WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) - ceph_con_v2_revoke(con); + ceph_con_v2_revoke(con, msg); else - ceph_con_v1_revoke(con); + ceph_con_v1_revoke(con, msg); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index eebe4e19d75a..cc4a36ef8462 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -169,10 +169,9 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. 
*/ -static void prepare_write_message_footer(struct ceph_connection *con) +static void prepare_write_message_footer(struct ceph_connection *con, + struct ceph_msg *m) { - struct ceph_msg *m = con->out_msg; - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); @@ -230,31 +229,31 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + m->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr)); /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); + m->footer.front_crc = cpu_to_le32(crc); if (m->middle) { crc = crc32c(0, m->middle->vec.iov_base, m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); + m->footer.middle_crc = cpu_to_le32(crc); } else - con->out_msg->footer.middle_crc = 0; + m->footer.middle_crc = 0; dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; + le32_to_cpu(m->footer.front_crc), + le32_to_cpu(m->footer.middle_crc)); + m->footer.flags = 0; /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; + m->footer.data_crc = 0; if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); + prepare_message_data(m, m->data_length); con->v1.out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ - prepare_write_message_footer(con); + prepare_write_message_footer(con, m); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -461,9 +460,9 @@ out: * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_message_data(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; @@ -515,7 +514,7 @@ static int write_partial_message_data(struct ceph_connection *con) else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); - prepare_write_message_footer(con); + prepare_write_message_footer(con, msg); return 1; /* must return > 0 to indicate success */ } @@ -1471,6 +1470,7 @@ bad_tag: */ int ceph_con_v1_try_write(struct ceph_connection *con) { + struct ceph_msg *msg; int ret = 1; dout("try_write start %p state %d\n", con, con->state); @@ -1517,14 +1517,15 @@ more: } /* msg pages? */ - if (con->out_msg) { + msg = con->out_msg; + if (msg) { if (con->v1.out_msg_done) { - ceph_msg_put(con->out_msg); + ceph_msg_put(msg); con->out_msg = NULL; /* we're done with this one */ goto do_next; } - ret = write_partial_message_data(con); + ret = write_partial_message_data(con, msg); if (ret == 1) goto more; /* we need to send the footer, too! 
*/ if (ret == 0) @@ -1563,10 +1564,8 @@ out: return ret; } -void ceph_con_v1_revoke(struct ceph_connection *con) +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - WARN_ON(con->v1.out_skip); /* footer */ if (con->v1.out_msg_done) { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c54c8b5a6526..b44e936f3865 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1560,10 +1560,11 @@ static int prepare_ack(struct ceph_connection *con) return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); } -static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +static void prepare_epilogue_plain(struct ceph_connection *con, + struct ceph_msg *msg, bool aborted) { dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, - con->out_msg, aborted, con->v2.out_epil.front_crc, + msg, aborted, con->v2.out_epil.front_crc, con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); encode_epilogue_plain(con, aborted); @@ -1574,10 +1575,9 @@ static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) * For "used" empty segments, crc is -1. For unused (trailing) * segments, crc is 0. */ -static void prepare_message_plain(struct ceph_connection *con) +static void prepare_message_plain(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - prepare_head_plain(con, con->v2.out_buf, sizeof(struct ceph_msg_header2), NULL, 0, false); @@ -1618,7 +1618,7 @@ static void prepare_message_plain(struct ceph_connection *con) con->v2.out_state = OUT_S_QUEUE_DATA; } else { con->v2.out_epil.data_crc = 0; - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } } @@ -1630,7 +1630,8 @@ static void prepare_message_plain(struct ceph_connection *con) * allocate pages for the entire tail of the message (currently up * to ~32M) and two sgs arrays (up to ~256K each)... 
*/ -static int prepare_message_secure(struct ceph_connection *con) +static int prepare_message_secure(struct ceph_connection *con, + struct ceph_msg *msg) { void *zerop = page_address(ceph_zero_page); struct sg_table enc_sgt = {}; @@ -1645,7 +1646,7 @@ static int prepare_message_secure(struct ceph_connection *con) if (ret) return ret; - tail_len = tail_onwire_len(con->out_msg, true); + tail_len = tail_onwire_len(msg, true); if (!tail_len) { /* * Empty message: once the head is written, @@ -1656,7 +1657,7 @@ static int prepare_message_secure(struct ceph_connection *con) } encode_epilogue_secure(con, false); - ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop, &con->v2.out_epil, NULL, 0, false); if (ret) goto out; @@ -1685,7 +1686,7 @@ static int prepare_message_secure(struct ceph_connection *con) goto out; dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, - con->out_msg, sgt.orig_nents, enc_page_cnt); + msg, sgt.orig_nents, enc_page_cnt); con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; out: @@ -1694,19 +1695,19 @@ out: return ret; } -static int prepare_message(struct ceph_connection *con) +static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg) { int lens[] = { sizeof(struct ceph_msg_header2), - front_len(con->out_msg), - middle_len(con->out_msg), - data_len(con->out_msg) + front_len(msg), + middle_len(msg), + data_len(msg) }; struct ceph_frame_desc desc; int ret; dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, - con->out_msg, lens[0], lens[1], lens[2], lens[3]); + msg, lens[0], lens[1], lens[2], lens[3]); if (con->in_seq > con->in_seq_acked) { dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, @@ -1717,15 +1718,15 @@ static int prepare_message(struct ceph_connection *con) reset_out_kvecs(con); init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); encode_preamble(&desc, con->v2.out_buf); - fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr, con->in_seq_acked); if (con_secure(con)) { - ret = prepare_message_secure(con); + ret = prepare_message_secure(con, msg); if (ret) return ret; } else { - prepare_message_plain(con); + prepare_message_plain(con, msg); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -3153,20 +3154,20 @@ int ceph_con_v2_try_read(struct ceph_connection *con) } } -static void queue_data(struct ceph_connection *con) +static void queue_data(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; con->v2.out_epil.data_crc = -1; - ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, - data_len(con->out_msg)); + ceph_msg_data_cursor_init(&con->v2.out_cursor, msg, + data_len(msg)); get_bvec_at(&con->v2.out_cursor, &bv); set_out_bvec(con, &bv, true); con->v2.out_state = OUT_S_QUEUE_DATA_CONT; } -static void queue_data_cont(struct ceph_connection *con) +static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; @@ -3187,7 +3188,7 @@ static void queue_data_cont(struct ceph_connection *con) * we are done. 
*/ reset_out_kvecs(con); - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3219,7 +3220,7 @@ static void queue_enc_page(struct ceph_connection *con) con->v2.out_state = OUT_S_FINISH_MESSAGE; } -static void queue_zeros(struct ceph_connection *con) +static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg) { dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); @@ -3236,7 +3237,7 @@ static void queue_zeros(struct ceph_connection *con) * Once it's written, we are done patching up for the revoke. */ reset_out_kvecs(con); - prepare_epilogue_plain(con, true); + prepare_epilogue_plain(con, msg, true); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3263,6 +3264,7 @@ static void finish_message(struct ceph_connection *con) static int populate_out_iter(struct ceph_connection *con) { + struct ceph_msg *msg; int ret; dout("%s con %p state %d out_state %d\n", __func__, con, con->state, @@ -3278,18 +3280,18 @@ static int populate_out_iter(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: WARN_ON(!con->out_msg); - queue_data(con); + queue_data(con, con->out_msg); goto populated; case OUT_S_QUEUE_DATA_CONT: WARN_ON(!con->out_msg); - queue_data_cont(con); + queue_data_cont(con, con->out_msg); goto populated; case OUT_S_QUEUE_ENC_PAGE: queue_enc_page(con); goto populated; case OUT_S_QUEUE_ZEROS: WARN_ON(con->out_msg); /* revoked */ - queue_zeros(con); + queue_zeros(con, con->out_msg); goto populated; case OUT_S_FINISH_MESSAGE: finish_message(con); @@ -3309,8 +3311,8 @@ static int populate_out_iter(struct ceph_connection *con) return ret; } } else if (!list_empty(&con->out_queue)) { - ceph_con_get_out_msg(con); - ret = prepare_message(con); + msg = ceph_con_get_out_msg(con); + ret = prepare_message(con, msg); if (ret) { pr_err("prepare_message failed: %d\n", ret); return ret; @@ -3422,17 +3424,18 @@ static u32 crc32c_zeros(u32 crc, int zero_len) return crc; } -static void prepare_zero_front(struct ceph_connection *con, int resid) +static void prepare_zero_front(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > front_len(con->out_msg)); - sent = front_len(con->out_msg) - resid; + WARN_ON(!resid || resid > front_len(msg)); + sent = front_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.front_crc = - crc32c(-1, con->out_msg->front.iov_base, sent); + crc32c(-1, msg->front.iov_base, sent); con->v2.out_epil.front_crc = crc32c_zeros(con->v2.out_epil.front_crc, resid); } else { @@ -3443,17 +3446,18 @@ static void prepare_zero_front(struct ceph_connection *con, int resid) out_zero_add(con, resid); } -static void prepare_zero_middle(struct ceph_connection *con, int resid) +static void prepare_zero_middle(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > middle_len(con->out_msg)); - sent = middle_len(con->out_msg) - resid; + WARN_ON(!resid || resid > middle_len(msg)); + sent = middle_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.middle_crc = - crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + crc32c(-1, msg->middle->vec.iov_base, sent); con->v2.out_epil.middle_crc = crc32c_zeros(con->v2.out_epil.middle_crc, resid); } else { @@ -3464,61 +3468,64 @@ static void prepare_zero_middle(struct ceph_connection *con, int resid) out_zero_add(con, 
resid); } -static void prepare_zero_data(struct ceph_connection *con) +static void prepare_zero_data(struct ceph_connection *con, + struct ceph_msg *msg) { dout("%s con %p\n", __func__, con); - con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); - out_zero_add(con, data_len(con->out_msg)); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg)); + out_zero_add(con, data_len(msg)); } -static void revoke_at_queue_data(struct ceph_connection *con) +static void revoke_at_queue_data(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - boundary = front_len(con->out_msg) + middle_len(con->out_msg); + boundary = front_len(msg) + middle_len(msg); if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg); + boundary = middle_len(msg); if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); + queue_zeros(con, msg); return; } WARN_ON(!resid); dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_middle(con, msg, resid); + prepare_zero_data(con, msg); + queue_zeros(con, msg); } -static void revoke_at_queue_data_cont(struct ceph_connection *con) +static void revoke_at_queue_data_cont(struct ceph_connection *con, + struct ceph_msg *msg) { int sent, resid; /* current piece of data */ - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); @@ -3537,10 +3544,11 @@ static void revoke_at_queue_data_cont(struct ceph_connection *con) con->v2.out_iter.count -= resid; out_zero_add(con, con->v2.out_cursor.total_resid); - queue_zeros(con); + queue_zeros(con, msg); } -static void revoke_at_finish_message(struct ceph_connection *con) +static void revoke_at_finish_message(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; @@ -3548,39 +3556,39 @@ static void revoke_at_finish_message(struct ceph_connection *con) WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - if (!front_len(con->out_msg) && !middle_len(con->out_msg) && - !data_len(con->out_msg)) { + if (!front_len(msg) && !middle_len(msg) && + !data_len(msg)) { WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head (empty message) - noop\n", __func__, con); return; } - boundary = front_len(con->out_msg) + 
middle_len(con->out_msg) + + boundary = front_len(msg) + middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3588,9 +3596,9 @@ static void revoke_at_finish_message(struct ceph_connection *con) if (resid > boundary) { resid -= boundary; dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); + prepare_zero_middle(con, msg, resid); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3598,7 +3606,7 @@ static void revoke_at_finish_message(struct ceph_connection *con) dout("%s con %p was sending epilogue - noop\n", __func__, con); } -void ceph_con_v2_revoke(struct ceph_connection *con) +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg) { WARN_ON(con->v2.out_zero); @@ -3611,13 +3619,13 @@ void ceph_con_v2_revoke(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: - revoke_at_queue_data(con); + revoke_at_queue_data(con, msg); break; case OUT_S_QUEUE_DATA_CONT: - revoke_at_queue_data_cont(con); + revoke_at_queue_data_cont(con, msg); break; case OUT_S_FINISH_MESSAGE: - revoke_at_finish_message(con); + revoke_at_finish_message(con, msg); break; default: WARN(1, "bad out_state %d", con->v2.out_state); -- cgit v1.2.3 From 6140f1d43ba9425dc55b12bdfd8877b0c5118d9a Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:55 +0200 Subject: libceph: add empty check to ceph_con_get_out_msg() This moves the list_empty() checks from the two callers (v1 and v2) into the base messenger.c library. Now the v1/v2 specializations do not need to know about con->out_queue; that implementation detail is now hidden behind the ceph_con_get_out_msg() function. 
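A sketch of the resulting caller pattern (as in the messenger_v1.c hunk below; the helper now returns NULL when con->out_queue is empty):

	struct ceph_msg *msg;

	if ((msg = ceph_con_get_out_msg(con)) != NULL) {
		prepare_write_message(con, msg);	/* queue was non-empty */
		goto more;
	}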
[ idryomov: instead of changing prepare_write_message() to return a bool, move ceph_con_get_out_msg() call out to arrive to the same pattern as in messenger_v2.c ] Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 4 +++- net/ceph/messenger_v1.c | 10 ++++------ net/ceph/messenger_v2.c | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 08a6a083609f..878bbfe770b1 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2114,7 +2114,9 @@ struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; - BUG_ON(list_empty(&con->out_queue)); + if (list_empty(&con->out_queue)) + return NULL; + msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); WARN_ON(msg->con != con); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index cc4a36ef8462..c9e002d96319 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -191,9 +191,9 @@ static void prepare_write_message_footer(struct ceph_connection *con, /* * Prepare headers for the next outgoing message. */ -static void prepare_write_message(struct ceph_connection *con) +static void prepare_write_message(struct ceph_connection *con, + struct ceph_msg *m) { - struct ceph_msg *m; u32 crc; con_out_kvec_reset(con); @@ -209,8 +209,6 @@ static void prepare_write_message(struct ceph_connection *con) &con->v1.out_temp_ack); } - m = ceph_con_get_out_msg(con); - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), @@ -1545,8 +1543,8 @@ do_next: goto more; } /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); + if ((msg = ceph_con_get_out_msg(con)) != NULL) { + prepare_write_message(con, msg); goto more; } if (con->in_seq > con->in_seq_acked) { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index b44e936f3865..9e39378eda00 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -3310,8 +3310,7 @@ static int populate_out_iter(struct ceph_connection *con) pr_err("prepare_keepalive2 failed: %d\n", ret); return ret; } - } else if (!list_empty(&con->out_queue)) { - msg = ceph_con_get_out_msg(con); + } else if ((msg = ceph_con_get_out_msg(con)) != NULL) { ret = prepare_message(con, msg); if (ret) { pr_err("prepare_message failed: %d\n", ret); -- cgit v1.2.3 From 07ca98f906a403637fc5e513a872a50ef1247f3b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 8 Oct 2025 18:56:59 +0200 Subject: xsk: Harden userspace-supplied xdp_desc validation Turned out certain clearly invalid values passed in xdp_desc from userspace can pass xp_{,un}aligned_validate_desc() and then lead to UBs or just invalid frames to be queued for xmit. desc->len close to ``U32_MAX`` with a non-zero pool->tx_metadata_len can cause positive integer overflow and wraparound, the same way low enough desc->addr with a non-zero pool->tx_metadata_len can cause negative integer overflow. Both scenarios can then pass the validation successfully. This doesn't happen with valid XSk applications, but can be used to perform attacks. Always promote desc->len to ``u64`` first to exclude positive overflows of it. Use explicit check_{add,sub}_overflow() when validating desc->addr (which is ``u64`` already). 
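The same promote-then-check discipline can be exercised in userspace with the compiler builtins that back the kernel's check_{add,sub}_overflow() macros. The sketch below is a simplified model of the address validation only (chunk-size and page-boundary checks omitted, names illustrative):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* userspace stand-ins for the kernel's check_{add,sub}_overflow() */
    #define check_sub_overflow(a, b, d) __builtin_sub_overflow(a, b, d)
    #define check_add_overflow(a, b, d) __builtin_add_overflow(a, b, d)

    static bool desc_addr_valid(uint64_t desc_addr, uint32_t desc_len,
                                uint32_t meta_len, uint64_t addrs_cnt)
    {
            uint64_t len = desc_len;  /* promote before any arithmetic */
            uint64_t addr, end;

            /* desc_addr < meta_len would wrap below zero */
            if (check_sub_overflow(desc_addr, (uint64_t)meta_len, &addr))
                    return false;
            if (addr >= addrs_cnt)
                    return false;
            /* addr high enough would wrap past addrs_cnt */
            if (check_add_overflow(addr, len + meta_len, &end) ||
                end > addrs_cnt)
                    return false;
            return true;
    }

    int main(void)
    {
            /* low addr with non-zero metadata: rejected, no wraparound */
            printf("%d\n", desc_addr_valid(4, 64, 8, 1 << 20));    /* 0 */
            printf("%d\n", desc_addr_valid(4096, 64, 8, 1 << 20)); /* 1 */
    }
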
bloat-o-meter reports a little growth of the code size: add/remove: 0/0 grow/shrink: 2/1 up/down: 60/-16 (44) Function old new delta xskq_cons_peek_desc 299 330 +31 xsk_tx_peek_release_desc_batch 973 1002 +29 xsk_generic_xmit 3148 3132 -16 but hopefully this doesn't hurt the performance much. Fixes: 341ac980eab9 ("xsk: Support tx_metadata_len") Cc: stable@vger.kernel.org # 6.8+ Signed-off-by: Alexander Lobakin Reviewed-by: Jason Xing Reviewed-by: Maciej Fijalkowski Link: https://lore.kernel.org/r/20251008165659.4141318-1-aleksander.lobakin@intel.com Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index f16f390370dc..1eb8d9f8b104 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -143,14 +143,24 @@ static inline bool xp_unused_options_set(u32 options) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = desc->addr - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; - u64 offset = addr & (pool->chunk_size - 1); + u64 len = desc->len; + u64 addr, offset; - if (!desc->len) + if (!len) return false; - if (offset + len > pool->chunk_size) + /* Can overflow if desc->addr < pool->tx_metadata_len */ + if (check_sub_overflow(desc->addr, pool->tx_metadata_len, &addr)) + return false; + + offset = addr & (pool->chunk_size - 1); + + /* + * Can't overflow: @offset is guaranteed to be < ``U32_MAX`` + * (pool->chunk_size is ``u32``), @len is guaranteed + * to be <= ``U32_MAX``. + */ + if (offset + len + pool->tx_metadata_len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) @@ -158,27 +168,42 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (xp_unused_options_set(desc->options)) return false; + return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; + u64 len = desc->len; + u64 addr, end; - if (!desc->len) + if (!len) return false; + /* Can't overflow: @len is guaranteed to be <= ``U32_MAX`` */ + len += pool->tx_metadata_len; if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, len)) + /* Can overflow if desc->addr is close to 0 */ + if (check_sub_overflow(xp_unaligned_add_offset_to_addr(desc->addr), + pool->tx_metadata_len, &addr)) + return false; + + if (addr >= pool->addrs_cnt) + return false; + + /* Can overflow if pool->addrs_cnt is high enough */ + if (check_add_overflow(addr, len, &end) || end > pool->addrs_cnt) + return false; + + if (xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; + return true; } -- cgit v1.2.3 From 93a27b5891b8194a8c083c9a80d2141d4bf47ba8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 27 Sep 2025 21:11:16 +0900 Subject: can: j1939: add missing calls in NETDEV_UNREGISTER notification handler Currently NETDEV_UNREGISTER event handler is not calling j1939_cancel_active_session() and j1939_sk_queue_drop_all(). This will result in these calls being skipped when j1939_sk_release() is called. And I guess that the reason syzbot is still reporting unregister_netdevice: waiting for vcan0 to become free. 
Usage count = 2 is caused by lack of these calls. Calling j1939_cancel_active_session(priv, sk) from j1939_sk_release() can be covered by calling j1939_cancel_active_session(priv, NULL) from j1939_netdev_notify(). Calling j1939_sk_queue_drop_all() from j1939_sk_release() can be covered by calling j1939_sk_netdev_event_netdown() from j1939_netdev_notify(). Therefore, we can reuse j1939_cancel_active_session(priv, NULL) and j1939_sk_netdev_event_netdown(priv) for NETDEV_UNREGISTER event handler. Fixes: 7fcbe5b2c6a4 ("can: j1939: implement NETDEV_UNREGISTER notification handler") Signed-off-by: Tetsuo Handa Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/3ad3c7f8-5a74-4b07-a193-cb0725823558@I-love.SAKURA.ne.jp Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 3706a872ecaf..a93af55df5fd 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -378,6 +378,8 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_ecu_unmap_all(priv); break; case NETDEV_UNREGISTER: + j1939_cancel_active_session(priv, NULL); + j1939_sk_netdev_event_netdown(priv); j1939_sk_netdev_event_unregister(priv); break; } -- cgit v1.2.3 From 25718fdcbdd2dadd15fc8b684df59b43970b91ed Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 9 Oct 2025 11:43:38 +0200 Subject: net: gro_cells: Use nested-BH locking for gro_cell The gro_cell data structure is per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
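A userspace analog of the pattern, assuming one mutex per per-CPU slot in place of local_lock_t (names illustrative, not the kernel API):

    #include <pthread.h>

    #define NR_CPUS 4

    struct cell {
            int             napi_skbs;   /* queue depth stand-in */
            pthread_mutex_t bh_lock;     /* local_lock_t stand-in */
    };

    static struct cell cells[NR_CPUS];

    static void cells_init(void)
    {
            for (int i = 0; i < NR_CPUS; i++)
                    pthread_mutex_init(&cells[i].bh_lock, NULL);
    }

    /* every access to the per-slot data goes through its own lock */
    static void cell_receive(int cpu)
    {
            struct cell *c = &cells[cpu];

            pthread_mutex_lock(&c->bh_lock);   /* local_lock_nested_bh() */
            c->napi_skbs++;
            pthread_mutex_unlock(&c->bh_lock); /* local_unlock_nested_bh() */
    }
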
Reported-by: syzbot+8715dd783e9b0bef43b1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68c6c3b1.050a0220.2ff435.0382.GAE@google.com/ Fixes: 3253cb49cbad ("softirq: Allow to drop the softirq-BKL lock on PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20251009094338.j1jyKfjR@linutronix.de Signed-off-by: Jakub Kicinski --- net/core/gro_cells.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ff8e5b64bf6b..b43911562f4d 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -8,11 +8,13 @@ struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; + local_lock_t bh_lock; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; + bool have_bh_lock = false; struct gro_cell *cell; int res; @@ -25,6 +27,8 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) goto unlock; } + local_lock_nested_bh(&gcells->cells->bh_lock); + have_bh_lock = true; cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { @@ -39,6 +43,9 @@ drop: if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); + if (have_bh_lock) + local_unlock_nested_bh(&gcells->cells->bh_lock); + res = NET_RX_SUCCESS; unlock: @@ -54,6 +61,7 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) struct sk_buff *skb; int work_done = 0; + __local_lock_nested_bh(&cell->bh_lock); while (work_done < budget) { skb = __skb_dequeue(&cell->napi_skbs); if (!skb) @@ -64,6 +72,7 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) if (work_done < budget) napi_complete_done(napi, work_done); + __local_unlock_nested_bh(&cell->bh_lock); return work_done; } @@ -79,6 +88,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); + local_lock_init(&cell->bh_lock); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); -- cgit v1.2.3 From 21f4d45eba0b2dcae5dbc9e5e0ad08735c993f16 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 9 Oct 2025 16:02:19 +0100 Subject: net/ip6_tunnel: Prevent perpetual tunnel growth Similarly to ipv4 tunnel, ipv6 version updates dev->needed_headroom, too. While ipv4 tunnel headroom adjustment growth was limited in commit 5ae1e9922bbd ("net: ip_tunnel: prevent perpetual headroom growth"), ipv6 tunnel yet increases the headroom without any ceiling. Reflect ipv4 tunnel headroom adjustment limit on ipv6 version. Credits to Francesco Ruggeri, who was originally debugging this issue and wrote local Arista-specific patch and a reproducer. 
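A self-contained sketch of the capped, grow-only update that the shared helper performs (the 512-byte ceiling comes from the existing ipv4 helper; the rest is illustrative):

    #include <stdio.h>

    static unsigned int needed_headroom;

    static void adj_headroom(unsigned int headroom)
    {
            const unsigned int max_allowed = 512;  /* cap from the ipv4 helper */

            if (headroom > max_allowed)
                    headroom = max_allowed;
            if (headroom > needed_headroom)        /* grows, never shrinks */
                    needed_headroom = headroom;
    }

    int main(void)
    {
            adj_headroom(100);
            adj_headroom(700);   /* e.g. a deeply nested encap path */
            adj_headroom(300);
            printf("%u\n", needed_headroom);  /* 512, not 700 */
    }
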
Fixes: 8eb30be0352d ("ipv6: Create ip6_tnl_xmit") Cc: Florian Westphal Cc: Francesco Ruggeri Signed-off-by: Dmitry Safonov Link: https://patch.msgid.link/20251009-ip6_tunnel-headroom-v2-1-8e4dbd8f7e35@arista.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 15 +++++++++++++++ net/ipv4/ip_tunnel.c | 14 -------------- net/ipv6/ip6_tunnel.c | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 4314a97702ea..ecae35512b9b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -611,6 +611,21 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); +static inline void ip_tunnel_adj_headroom(struct net_device *dev, + unsigned int headroom) +{ + /* we must cap headroom to some upperlimit, else pskb_expand_head + * will overflow header offsets in skb_headers_offset_update(). + */ + const unsigned int max_allowed = 512; + + if (headroom > max_allowed) + headroom = max_allowed; + + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); +} + int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index aaeb5d16f0c9..158a30ae7c5f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -568,20 +568,6 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } -static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) -{ - /* we must cap headroom to some upperlimit, else pskb_expand_head - * will overflow header offsets in skb_headers_offset_update(). - */ - static const unsigned int max_allowed = 512; - - if (headroom > max_allowed) - headroom = max_allowed; - - if (headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, headroom); -} - void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3262e81223df..6405072050e0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1257,8 +1257,7 @@ route_lookup: */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, max_headroom); + ip_tunnel_adj_headroom(dev, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) -- cgit v1.2.3 From 295ce1eb36ae47dc862d6c8a1012618a25516208 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Oct 2025 11:57:42 +0000 Subject: tcp: fix tcp_tso_should_defer() vs large RTT Neal reported that using neper tcp_stream with TCP_TX_DELAY set to 50ms would often lead to flows stuck in a small cwnd mode, regardless of the congestion control. While tcp_stream sets TCP_TX_DELAY too late after the connect(), it highlighted two kernel bugs. The following heuristic in tcp_tso_should_defer() seems wrong for large RTT: delta = tp->tcp_clock_cache - head->tstamp; /* If next ACK is likely to come too late (half srtt), do not defer */ if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) goto send_now; If next ACK is expected to come in more than 1 ms, we should not defer because we prefer a smooth ACK clocking. While blamed commit was a step in the good direction, it was not generic enough. 
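A standalone model of the new heuristic, assuming the srtt_us storage convention from struct tcp_sock (srtt shifted left by 3); the capped threshold is min(srtt/2, 1ms) rather than the unconditional half srtt:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_USEC 1000ULL
    #define NSEC_PER_MSEC 1000000ULL

    /* tp->srtt_us stores srtt << 3, hence the >> 3 folded into the scale */
    static int send_now(uint64_t head_tstamp, uint64_t now, uint32_t srtt_us)
    {
            uint64_t srtt_ns = (NSEC_PER_USEC >> 3) * srtt_us;
            uint64_t expected_ack = head_tstamp + srtt_ns;
            uint64_t how_far = expected_ack - now;
            uint64_t threshold = srtt_ns / 2 < NSEC_PER_MSEC ?
                                 srtt_ns / 2 : NSEC_PER_MSEC;

            return (int64_t)(how_far - threshold) > 0;
    }

    int main(void)
    {
            /* srtt = 100ms: ACK ~100ms away, far past the 1ms cap */
            printf("%d\n", send_now(0, 0, 100000 << 3));    /* 1: send now */
            /* srtt = 1ms: ACK due within half srtt, deferring is fine */
            printf("%d\n", send_now(0, 900000, 1000 << 3)); /* 0: defer */
    }
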
Another patch fixing TCP_TX_DELAY for established flows will be proposed when net-next reopens. Fixes: 50c8339e9299 ("tcp: tso: restore IW10 after TSO autosizing") Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Tested-by: Neal Cardwell Link: https://patch.msgid.link/20251011115742.1245771-1-edumazet@google.com [pabeni@redhat.com: fixed whitespace issue] Signed-off-by: Paolo Abeni --- net/ipv4/tcp_output.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bb3576ac0ad7..b94efb3050d2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2369,7 +2369,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 send_win, cong_win, limit, in_flight; + u32 send_win, cong_win, limit, in_flight, threshold; + u64 srtt_in_ns, expected_ack, how_far_is_the_ack; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; @@ -2431,9 +2432,19 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - delta = tp->tcp_clock_cache - head->tstamp; - /* If next ACK is likely to come too late (half srtt), do not defer */ - if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) + + srtt_in_ns = (u64)(NSEC_PER_USEC >> 3) * tp->srtt_us; + /* When is the ACK expected ? */ + expected_ack = head->tstamp + srtt_in_ns; + /* How far from now is the ACK expected ? */ + how_far_is_the_ack = expected_ack - tp->tcp_clock_cache; + + /* If next ACK is likely to come too late, + * ie in more than min(1ms, half srtt), do not defer. + */ + threshold = min(srtt_in_ns >> 1, NSEC_PER_MSEC); + + if ((s64)(how_far_is_the_ack - threshold) > 0) goto send_now; /* Ok, it looks like it is advisable to defer. -- cgit v1.2.3 From 7f9ee5fc97e14682e36fe22ae2654c07e4998b82 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Tue, 14 Oct 2025 17:30:37 +0530 Subject: bpf: test_run: Fix ctx leak in bpf_prog_test_run_xdp error path Fix a memory leak in bpf_prog_test_run_xdp() where the context buffer allocated by bpf_ctx_init() is not freed when the function returns early due to a data size check. On the failing path: ctx = bpf_ctx_init(...); if (kattr->test.data_size_in - meta_sz < ETH_HLEN) return -EINVAL; The early return bypasses the cleanup label that kfree()s ctx, leading to a leak detectable by kmemleak under fuzzing. Change the return to jump to the existing free_ctx label. 
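The fix is the classic single-exit cleanup idiom; a minimal sketch with userspace stand-ins (malloc/free for bpf_ctx_init/kfree, names illustrative):

    #include <errno.h>
    #include <stdlib.h>

    #define ETH_HLEN 14

    int run_test(size_t data_size_in, size_t meta_sz)
    {
            int ret = -EINVAL;
            void *ctx = malloc(64);   /* bpf_ctx_init() stand-in */

            if (!ctx)
                    return -ENOMEM;

            if (data_size_in - meta_sz < ETH_HLEN)
                    goto free_ctx;    /* a bare "return -EINVAL" leaks ctx */

            /* ... run the program ... */
            ret = 0;
    free_ctx:
            free(ctx);
            return ret;
    }
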
Fixes: fe9544ed1a2e ("bpf: Support specifying linear xdp packet data size for BPF_PROG_TEST_RUN") Reported-by: BPF Runtime Fuzzer (BRF) Signed-off-by: Shardul Bankar Signed-off-by: Martin KaFai Lau Acked-by: Jiri Olsa Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20251014120037.1981316-1-shardulsb08@gmail.com --- net/bpf/test_run.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index dfb03ee0bb62..1782e83de2cb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1269,7 +1269,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, goto free_ctx; if (kattr->test.data_size_in - meta_sz < ETH_HLEN) - return -EINVAL; + goto free_ctx; data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { -- cgit v1.2.3 From 7f0fddd817ba6daebea1445ae9fab4b6d2294fa8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 13 Oct 2025 20:50:52 +0200 Subject: net: core: fix lockdep splat on device unregister Since blamed commit, unregister_netdevice_many_notify() takes the netdev mutex if the device needs it. If the device list is too long, this will lock more device mutexes than lockdep can handle: unshare -n \ bash -c 'for i in $(seq 1 100);do ip link add foo$i type dummy;done' BUG: MAX_LOCK_DEPTH too low! turning off the locking correctness validator. depth: 48 max: 48! 48 locks held by kworker/u16:1/69: #0: ..148 ((wq_completion)netns){+.+.}-{0:0}, at: process_one_work #1: ..d40 (net_cleanup_work){+.+.}-{0:0}, at: process_one_work #2: ..bd0 (pernet_ops_rwsem){++++}-{4:4}, at: cleanup_net #3: ..aa8 (rtnl_mutex){+.+.}-{4:4}, at: default_device_exit_batch #4: ..cb0 (&dev_instance_lock_key#3){+.+.}-{4:4}, at: unregister_netdevice_many_notify [..] Add a helper to close and then unlock a list of net_devices. Devices that are not up have to be skipped - netif_close_many always removes them from the list without any other actions taken, so they'd remain in locked state. Close devices whenever we've used up half of the tracking slots or we processed entire list without hitting the limit. Fixes: 7e4d784f5810 ("net: hold netdev instance lock during rtnetlink operations") Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20251013185052.14021-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- net/core/dev.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a64cef2c537e..2acfa44927da 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -12176,6 +12176,35 @@ static void dev_memory_provider_uninstall(struct net_device *dev) } } +/* devices must be UP and netdev_lock()'d */ +static void netif_close_many_and_unlock(struct list_head *close_head) +{ + struct net_device *dev, *tmp; + + netif_close_many(close_head, false); + + /* ... now unlock them */ + list_for_each_entry_safe(dev, tmp, close_head, close_list) { + netdev_unlock(dev); + list_del_init(&dev->close_list); + } +} + +static void netif_close_many_and_unlock_cond(struct list_head *close_head) +{ +#ifdef CONFIG_LOCKDEP + /* We can only track up to MAX_LOCK_DEPTH locks per task. + * + * Reserve half the available slots for additional locks possibly + * taken by notifiers and (soft)irqs. 
+ */ + unsigned int limit = MAX_LOCK_DEPTH / 2; + + if (lockdep_depth(current) > limit) + netif_close_many_and_unlock(close_head); +#endif +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -12208,17 +12237,18 @@ void unregister_netdevice_many_notify(struct list_head *head, /* If device is running, close it first. Start with ops locked... */ list_for_each_entry(dev, head, unreg_list) { + if (!(dev->flags & IFF_UP)) + continue; if (netdev_need_ops_lock(dev)) { list_add_tail(&dev->close_list, &close_head); netdev_lock(dev); } + netif_close_many_and_unlock_cond(&close_head); } - netif_close_many(&close_head, true); - /* ... now unlock them and go over the rest. */ + netif_close_many_and_unlock(&close_head); + /* ... now go over the rest. */ list_for_each_entry(dev, head, unreg_list) { - if (netdev_need_ops_lock(dev)) - netdev_unlock(dev); - else + if (!netdev_need_ops_lock(dev)) list_add_tail(&dev->close_list, &close_head); } netif_close_many(&close_head, true); -- cgit v1.2.3 From ce5af41e3234425a40974696682163edfd21128c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 14 Oct 2025 11:16:56 +0200 Subject: tls: trim encrypted message to match the plaintext on short splice During tls_sw_sendmsg_locked, we pre-allocate the encrypted message for the size we're expecting to send during the current iteration, but we may end up sending less, for example when splicing: if we're getting the data from small fragments of memory, we may fill up all the slots in the skmsg with less data than expected. In this case, we need to trim the encrypted message to only the length we actually need, to avoid pushing uninitialized bytes down the underlying TCP socket. Fixes: fe1e81d4f73b ("tls/sw: Support MSG_SPLICE_PAGES") Reported-by: Jann Horn Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/66a0ae99c9efc15f88e9e56c1f58f902f442ce86.1760432043.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index daac9fd4be7e..36ca3011ab87 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1112,8 +1112,11 @@ alloc_encrypted: goto send_end; tls_ctx->pending_open_record_frags = true; - if (sk_msg_full(msg_pl)) + if (sk_msg_full(msg_pl)) { full_record = true; + sk_msg_trim(sk, msg_en, + msg_pl->sg.size + prot->overhead_size); + } if (full_record || eor) goto copied; -- cgit v1.2.3 From b014a4e066c555185b7c367efacdc33f16695495 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 14 Oct 2025 11:16:57 +0200 Subject: tls: wait for async encrypt in case of error during latter iterations of sendmsg If we hit an error during the main loop of tls_sw_sendmsg_locked (eg failed allocation), we jump to send_end and immediately return. Previous iterations may have queued async encryption requests that are still pending. We should wait for those before returning, as we could otherwise be reading from memory that userspace believes we're not using anymore, which would be a sort of use-after-free. This is similar to what tls_sw_recvmsg already does: failures during the main loop jump to the "wait for async" code, not straight to the unlock/return. 
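A toy model of the control flow, assuming a simple in-flight counter in place of the real async crypto completion machinery:

    #include <errno.h>
    #include <stdio.h>

    static int in_flight;              /* records queued for async encrypt */

    static void submit_async(void)   { in_flight++; }
    static void wait_for_async(void) { in_flight = 0; /* crypto_wait_req() */ }

    static int send_loop(int nrecs, int fail_at)
    {
            int ret = 0;

            for (int i = 0; i < nrecs; i++) {
                    if (i == fail_at) {
                            ret = -ENOMEM;
                            goto end;  /* not "return": older records in flight */
                    }
                    submit_async();
            }
    end:
            wait_for_async();  /* nothing may still reference user buffers */
            return ret;
    }

    int main(void)
    {
            printf("%d in_flight=%d\n", send_loop(8, 5), in_flight);
    }
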
Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Reported-by: Jann Horn Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/c793efe9673b87f808d84fdefc0f732217030c52.1760432043.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 36ca3011ab87..1478d515badc 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1054,7 +1054,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, if (ret == -EINPROGRESS) num_async++; else if (ret != -EAGAIN) - goto send_end; + goto end; } } @@ -1226,8 +1226,9 @@ trim_sgl: goto alloc_encrypted; } +send_end: if (!num_async) { - goto send_end; + goto end; } else if (num_zc || eor) { int err; @@ -1245,7 +1246,7 @@ trim_sgl: tls_tx_records(sk, msg->msg_flags); } -send_end: +end: ret = sk_stream_error(sk, msg->msg_flags, ret); return copied > 0 ? copied : ret; } -- cgit v1.2.3 From b6fe4c29bb51cf239ecf48eacf72b924565cb619 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 14 Oct 2025 11:16:58 +0200 Subject: tls: always set record_type in tls_process_cmsg When userspace wants to send a non-DATA record (via the TLS_SET_RECORD_TYPE cmsg), we need to send any pending data from a previous MSG_MORE send() as a separate DATA record. If that DATA record is encrypted asynchronously, tls_handle_open_record will return -EINPROGRESS. This is currently treated as an error by tls_process_cmsg, and it will skip setting record_type to the correct value, but the caller (tls_sw_sendmsg_locked) handles that return value correctly and proceeds with sending the new message with an incorrect record_type (DATA instead of whatever was requested in the cmsg). Always set record_type before handling the open record. If tls_handle_open_record returns an error, record_type will be ignored. If it succeeds, whether with synchronous crypto (returning 0) or asynchronous (returning -EINPROGRESS), the caller will proceed correctly. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Reported-by: Jann Horn Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/0457252e578a10a94e40c72ba6288b3a64f31662.1760432043.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_main.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a3ccb3135e51..39a2ab47fe72 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -255,12 +255,9 @@ int tls_process_cmsg(struct sock *sk, struct msghdr *msg, if (msg->msg_flags & MSG_MORE) return -EINVAL; - rc = tls_handle_open_record(sk, msg->msg_flags); - if (rc) - return rc; - *record_type = *(unsigned char *)CMSG_DATA(cmsg); - rc = 0; + + rc = tls_handle_open_record(sk, msg->msg_flags); break; default: return -EINVAL; -- cgit v1.2.3 From b8a6ff84abbcbbc445463de58704686011edc8e1 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 14 Oct 2025 11:16:59 +0200 Subject: tls: wait for pending async decryptions if tls_strp_msg_hold fails Async decryption calls tls_strp_msg_hold to create a clone of the input skb to hold references to the memory it uses. If we fail to allocate that clone, proceeding with async decryption can lead to various issues (UAF on the skb, writing into userspace memory after the recv() call has returned). In this case, wait for all pending decryption requests. 
Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Reported-by: Jann Horn Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/b9fe61dcc07dab15da9b35cf4c7d86382a98caf2.1760432043.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1478d515badc..e3d852091e7a 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1641,8 +1641,10 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, if (unlikely(darg->async)) { err = tls_strp_msg_hold(&ctx->strp, &ctx->async_hold); - if (err) - __skb_queue_tail(&ctx->async_hold, darg->skb); + if (err) { + err = tls_decrypt_async_wait(ctx); + darg->async = false; + } return err; } -- cgit v1.2.3 From 7f846c65ca11e63d2409868ff039081f80e42ae4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 14 Oct 2025 11:17:00 +0200 Subject: tls: don't rely on tx_work during send() With async crypto, we rely on tx_work to actually transmit records once encryption completes. But while send() is running, both the tx_lock and socket lock are held, so tx_work_handler cannot process the queue of encrypted records, and simply reschedules itself. During a large send(), this could last a long time, and use a lot of memory. Transmit any pending encrypted records before restarting the main loop of tls_sw_sendmsg_locked. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Reported-by: Jann Horn Signed-off-by: Sabrina Dubroca Link: https://patch.msgid.link/8396631478f70454b44afb98352237d33f48d34d.1760432043.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e3d852091e7a..d17135369980 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1152,6 +1152,13 @@ alloc_encrypted: } else if (ret != -EAGAIN) goto send_end; } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); + } + continue; rollback_iter: copied -= try_to_copy; @@ -1207,6 +1214,12 @@ copied: goto send_end; } } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); + } } continue; -- cgit v1.2.3 From 6de1dec1c166c7f7324ce52ccfdf43e2fa743b19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Oct 2025 05:27:15 +0000 Subject: udp: do not use skb_release_head_state() before skb_attempt_defer_free() Michal reported and bisected an issue after recent adoption of skb_attempt_defer_free() in UDP. The issue here is that skb_release_head_state() is called twice per skb, one time from skb_consume_udp(), then a second time from skb_defer_free_flush() and napi_consume_skb(). As Sabrina suggested, remove skb_release_head_state() call from skb_consume_udp(). Add a DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)) in skb_attempt_defer_free() Many thanks to Michal, Sabrina, Paolo and Florian for their help. 
Fixes: 6471658dc66c ("udp: use skb_attempt_defer_free()") Reported-and-bisected-by: Michal Kubecek Closes: https://lore.kernel.org/netdev/gpjh4lrotyephiqpuldtxxizrsg6job7cvhiqrw72saz2ubs3h@g6fgbvexgl3r/ Signed-off-by: Eric Dumazet Tested-by: Michal Kubecek Cc: Sabrina Dubroca Cc: Florian Westphal Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20251015052715.4140493-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 1 + net/ipv4/udp.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bc12790017b0..6be01454f262 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7200,6 +7200,7 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); + DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 95241093b7f0..30dfbf73729d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1851,8 +1851,6 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) sk_peek_offset_bwd(sk, len); if (!skb_shared(skb)) { - if (unlikely(udp_skb_has_head_state(skb))) - skb_release_head_state(skb); skb_attempt_defer_free(skb); return; } -- cgit v1.2.3 From d0d3e9c2867b32c9c70e39e74b9425871cf0042a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Oct 2025 06:32:21 +0000 Subject: net: gro: clear skb_shinfo(skb)->hwtstamps in napi_reuse_skb() Some network drivers assume this field is zero after napi_get_frags(). We must clear it in napi_reuse_skb() otherwise the following can happen: 1) A packet is received, and skb_shinfo(skb)->hwtstamps is populated because a bit in the receive descriptor announced hwtstamp availability for this packet. 2) Packet is given to gro layer via napi_gro_frags(). 3) Packet is merged to a prior one held in GRO queues. 4) skb is saved after some cleanup in napi->skb via a call to napi_reuse_skb(). 5) Next packet is received 10 seconds later, gets the recycled skb from napi_get_frags(). 6) The receive descriptor does not announce hwtstamp availability. Driver does not clear shinfo->hwtstamps. 7) We have in shinfo->hwtstamps an old timestamp. 
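The general rule the fix enforces: a recycled object must have every per-use field reset before it is handed out again. A minimal sketch (field set and names illustrative):

    #include <stdint.h>
    #include <stdio.h>

    struct shinfo { uint16_t gso_type, gso_size; uint64_t hwtstamp; };

    /* a field missed here (as hwtstamp was) carries state from a
     * packet received earlier into a later, unrelated packet */
    static void reuse(struct shinfo *s)
    {
            s->gso_type = 0;
            s->gso_size = 0;
            s->hwtstamp = 0;   /* the reset the patch adds */
    }

    int main(void)
    {
            struct shinfo s = { 1, 1400, 123456789ULL };

            reuse(&s);
            printf("%llu\n", (unsigned long long)s.hwtstamp);  /* 0 */
    }
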
Fixes: ac45f602ee3d ("net: infrastructure for hardware time stamping") Signed-off-by: Eric Dumazet Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20251015063221.4171986-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/gro.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/gro.c b/net/core/gro.c index 5ba4504cfd28..76f9c3712422 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -639,6 +639,8 @@ EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + struct skb_shared_info *shinfo; + if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; @@ -655,8 +657,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb->ip_summed = CHECKSUM_NONE; - skb_shinfo(skb)->gso_type = 0; - skb_shinfo(skb)->gso_size = 0; + + shinfo = skb_shinfo(skb); + shinfo->gso_type = 0; + shinfo->gso_size = 0; + shinfo->hwtstamps.hwtstamp = 0; + if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); -- cgit v1.2.3 From bf29555f5bdc017bac22ca66fcb6c9f46ec8788f Mon Sep 17 00:00:00 2001 From: Johannes Wiesböck Date: Wed, 15 Oct 2025 22:15:43 +0200 Subject: rtnetlink: Allow deleting FDB entries in user namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creating FDB entries is possible from a non-initial user namespace when having CAP_NET_ADMIN, yet, when deleting FDB entries, processes receive an EPERM because the capability is always checked against the initial user namespace. This restricts the FDB management from unprivileged containers. Drop the netlink_capable check in rtnl_fdb_del as it was originally dropped in c5c351088ae7 and reintroduced in 1690be63a27b without intention. This patch was tested using a container on GyroidOS, where it was possible to delete FDB entries from an unprivileged user namespace and private network namespace. Fixes: 1690be63a27b ("bridge: Add vlan support to static neighbors") Reviewed-by: Michael Weiß Tested-by: Harshal Gohel Signed-off-by: Johannes Wiesböck Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20251015201548.319871-1-johannes.wiesboeck@aisec.fraunhofer.de Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8040ff7c356e..576d5ec3bb36 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4715,9 +4715,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u16 vid; - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); -- cgit v1.2.3 From 7c33e97a6ef5d84e98b892c3e00c6d1678d20395 Mon Sep 17 00:00:00 2001 From: Sahil Chandna Date: Wed, 15 Oct 2025 00:26:35 +0530 Subject: bpf: Do not disable preemption in bpf_test_run(). 
The timer mode is initialized to NO_PREEMPT mode by default, this disables preemption and force execution in atomic context causing issue on PREEMPT_RT configurations when invoking spin_lock_bh(), leading to the following warning: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 6107, name: syz.0.17 preempt_count: 1, expected: 0 RCU nest depth: 1, expected: 1 Preemption disabled at: [] bpf_test_timer_enter+0xf8/0x140 net/bpf/test_run.c:42 Fix this, by removing NO_PREEMPT/NO_MIGRATE mode check. Also, the test timer context no longer needs explicit calls to migrate_disable()/migrate_enable() with rcu_read_lock()/rcu_read_unlock(). Use helpers rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate() instead. Reported-by: syzbot+1f1fbecb9413cdbfbef8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=1f1fbecb9413cdbfbef8 Suggested-by: Yonghong Song Suggested-by: Menglong Dong Acked-by: Yonghong Song Tested-by: syzbot+1f1fbecb9413cdbfbef8@syzkaller.appspotmail.com Co-developed-by: Brahmajit Das Signed-off-by: Brahmajit Das Signed-off-by: Sahil Chandna Link: https://lore.kernel.org/r/20251014185635.10300-1-chandna.sahil@gmail.com Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1782e83de2cb..8b7d0b90fea7 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -29,7 +29,6 @@ #include struct bpf_test_timer { - enum { NO_PREEMPT, NO_MIGRATE } mode; u32 i; u64 time_start, time_spent; }; @@ -37,12 +36,7 @@ struct bpf_test_timer { static void bpf_test_timer_enter(struct bpf_test_timer *t) __acquires(rcu) { - rcu_read_lock(); - if (t->mode == NO_PREEMPT) - preempt_disable(); - else - migrate_disable(); - + rcu_read_lock_dont_migrate(); t->time_start = ktime_get_ns(); } @@ -50,12 +44,7 @@ static void bpf_test_timer_leave(struct bpf_test_timer *t) __releases(rcu) { t->time_start = 0; - - if (t->mode == NO_PREEMPT) - preempt_enable(); - else - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, @@ -374,7 +363,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, { struct xdp_test_data xdp = { .batch_size = batch_size }; - struct bpf_test_timer t = { .mode = NO_MIGRATE }; + struct bpf_test_timer t = {}; int ret; if (!repeat) @@ -404,7 +393,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; - struct bpf_test_timer t = { NO_MIGRATE }; + struct bpf_test_timer t = {}; enum bpf_cgroup_storage_type stype; int ret; @@ -1377,7 +1366,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_test_timer t = {}; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; @@ -1445,7 +1434,7 @@ out: int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_test_timer t = {}; struct bpf_prog_array *progs = NULL; struct bpf_sk_lookup_kern ctx = {}; u32 repeat = kattr->test.repeat; -- cgit v1.2.3 From 
607844761454e3c17e928002e126ccf21c83f6aa Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Wed, 24 Sep 2025 18:30:14 +0530 Subject: wifi: mac80211: reset FILS discovery and unsol probe resp intervals When ieee80211_stop_ap() deletes the FILS discovery and unsolicited broadcast probe response templates, the associated interval values are not reset. This can lead to drivers subsequently operating with the non-zero values, leading to unexpected behavior. Trigger repeated retrieval attempts of the FILS discovery template in ath12k, resulting in excessive log messages such as: mac vdev 0 failed to retrieve FILS discovery template mac vdev 4 failed to retrieve FILS discovery template Fix this by resetting the intervals in ieee80211_stop_ap() to ensure proper cleanup of FILS discovery and unsolicited broadcast probe response templates. Fixes: 295b02c4be74 ("mac80211: Add FILS discovery support") Fixes: 632189a0180f ("mac80211: Unsolicited broadcast probe response support") Signed-off-by: Aloka Dixit Signed-off-by: Aaradhana Sahu Link: https://patch.msgid.link/20250924130014.2575533-1-aaradhana.sahu@oss.qualcomm.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d9aca1c3c097..c52b0456039d 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1876,6 +1876,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; + link_conf->fils_discovery.min_interval = 0; + link_conf->fils_discovery.max_interval = 0; + link_conf->unsol_bcast_probe_resp_interval = 0; __sta_info_flush(sdata, true, link_id, NULL); -- cgit v1.2.3 From ed6a47346ec69e7f1659e0a1a3558293f60d5dd7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 19 Oct 2025 11:54:27 +0300 Subject: wifi: mac80211: fix key tailroom accounting leak For keys added by ieee80211_gtk_rekey_add(), we assume that they're already present in the hardware and set the flag KEY_FLAG_UPLOADED_TO_HARDWARE. However, setting this flag needs to be paired with decrementing the tailroom needed, which was missed. 
Fixes: f52a0b408ed1 ("wifi: mac80211: mark keys as uploaded when added by the driver") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251019115358.c88eafb4083e.I69e9d4d78a756a133668c55b5570cf15a4b0e6a4@changeid Signed-off-by: Johannes Berg --- net/mac80211/key.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index b14e9cd9713f..d5da7ccea66e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -508,11 +508,16 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, ret = ieee80211_key_enable_hw_accel(new); } } else { - if (!new->local->wowlan) + if (!new->local->wowlan) { ret = ieee80211_key_enable_hw_accel(new); - else if (link_id < 0 || !sdata->vif.active_links || - BIT(link_id) & sdata->vif.active_links) + } else if (link_id < 0 || !sdata->vif.active_links || + BIT(link_id) & sdata->vif.active_links) { new->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; + if (!(new->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE | + IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) + decrease_tailroom_need_count(sdata, 1); + } } if (ret) -- cgit v1.2.3 From 249e1443e3d57e059925bdb698f53e4d008fc106 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 20 Oct 2025 10:57:45 +0300 Subject: wifi: nl80211: call kfree without a NULL check Coverity is unhappy because we may leak old_radio_rts_threshold. Since this pointer is only valid in the context of the function and kfree is NULL pointer safe, don't check and just call kfree. Note that somehow, we were checking old_rts_threshold to free old_radio_rts_threshold which is a bit odd. Fixes: 264637941cf4 ("wifi: cfg80211: Add Support to Set RTS Threshold for each Radio") Reviewed-by: Johannes Berg Signed-off-by: Emmanuel Grumbach Link: https://patch.msgid.link/20251020075745.44168-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 346dfd2bd987..03d07b54359a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4136,8 +4136,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.txq_quantum = old_txq_quantum; } - if (old_rts_threshold) - kfree(old_radio_rts_threshold); + kfree(old_radio_rts_threshold); return result; } -- cgit v1.2.3 From f584239a9ed25057496bf397c370cc5163dde419 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Fri, 17 Oct 2025 10:48:27 +0800 Subject: net/smc: fix general protection fault in __smc_diag_dump The syzbot report a crash: Oops: general protection fault, probably for non-canonical address 0xfbd5a5d5a0000003: 0000 [#1] SMP KASAN NOPTI KASAN: maybe wild-memory-access in range [0xdead4ead00000018-0xdead4ead0000001f] CPU: 1 UID: 0 PID: 6949 Comm: syz.0.335 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 08/18/2025 RIP: 0010:smc_diag_msg_common_fill net/smc/smc_diag.c:44 [inline] RIP: 0010:__smc_diag_dump.constprop.0+0x3ca/0x2550 net/smc/smc_diag.c:89 Call Trace: smc_diag_dump_proto+0x26d/0x420 net/smc/smc_diag.c:217 smc_diag_dump+0x27/0x90 net/smc/smc_diag.c:234 netlink_dump+0x539/0xd30 net/netlink/af_netlink.c:2327 __netlink_dump_start+0x6d6/0x990 net/netlink/af_netlink.c:2442 netlink_dump_start include/linux/netlink.h:341 [inline] smc_diag_handler_dump+0x1f9/0x240 
net/smc/smc_diag.c:251 __sock_diag_cmd net/core/sock_diag.c:249 [inline] sock_diag_rcv_msg+0x438/0x790 net/core/sock_diag.c:285 netlink_rcv_skb+0x158/0x420 net/netlink/af_netlink.c:2552 netlink_unicast_kernel net/netlink/af_netlink.c:1320 [inline] netlink_unicast+0x5a7/0x870 net/netlink/af_netlink.c:1346 netlink_sendmsg+0x8d1/0xdd0 net/netlink/af_netlink.c:1896 sock_sendmsg_nosec net/socket.c:714 [inline] __sock_sendmsg net/socket.c:729 [inline] ____sys_sendmsg+0xa95/0xc70 net/socket.c:2614 ___sys_sendmsg+0x134/0x1d0 net/socket.c:2668 __sys_sendmsg+0x16d/0x220 net/socket.c:2700 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x4e0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The process like this: (CPU1) | (CPU2) ---------------------------------|------------------------------- inet_create() | // init clcsock to NULL | sk = sk_alloc() | | // unexpectedly change clcsock | inet_init_csk_locks() | | // add sk to hash table | smc_inet_init_sock() | smc_sk_init() | smc_hash_sk() | | // traverse the hash table | smc_diag_dump_proto | __smc_diag_dump() | // visit wrong clcsock | smc_diag_msg_common_fill() // alloc clcsock | smc_create_clcsk | sock_create_kern | With CONFIG_DEBUG_LOCK_ALLOC=y, the smc->clcsock is unexpectedly changed in inet_init_csk_locks(). The INET_PROTOSW_ICSK flag is no need by smc, just remove it. After removing the INET_PROTOSW_ICSK flag, this patch alse revert commit 6fd27ea183c2 ("net/smc: fix lacks of icsk_syn_mss with IPPROTO_SMC") to avoid casting smc_sock to inet_connection_sock. Reported-by: syzbot+f775be4458668f7d220e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f775be4458668f7d220e Tested-by: syzbot+f775be4458668f7d220e@syzkaller.appspotmail.com Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Signed-off-by: Wang Liang Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: D. Wythe Link: https://patch.msgid.link/20251017024827.3137512-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/smc/smc_inet.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net') diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index a944e7dcb8b9..a94084b4a498 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -56,7 +56,6 @@ static struct inet_protosw smc_inet_protosw = { .protocol = IPPROTO_SMC, .prot = &smc_inet_prot, .ops = &smc_inet_stream_ops, - .flags = INET_PROTOSW_ICSK, }; #if IS_ENABLED(CONFIG_IPV6) @@ -104,27 +103,15 @@ static struct inet_protosw smc_inet6_protosw = { .protocol = IPPROTO_SMC, .prot = &smc_inet6_prot, .ops = &smc_inet6_stream_ops, - .flags = INET_PROTOSW_ICSK, }; #endif /* CONFIG_IPV6 */ -static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) -{ - /* No need pass it through to clcsock, mss can always be set by - * sock_create_kern or smc_setsockopt. - */ - return 0; -} - static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); - - inet_csk(sk)->icsk_sync_mss = smc_sync_mss; - /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } -- cgit v1.2.3 From e84cb860ac3ce67ec6ecc364433fd5b412c448bc Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 20 Oct 2025 22:53:26 +0200 Subject: mptcp: pm: in-kernel: C-flag: handle late ADD_ADDR The special C-flag case expects the ADD_ADDR to be received when switching to 'fully-established'. 
But for various reasons, the ADD_ADDR could be sent after the "4th ACK", and the special case doesn't work. On NIPA, the new test validating this special case for the C-flag failed a few times, e.g. 102 default limits, server deny join id 0 syn rx [FAIL] got 0 JOIN[s] syn rx expected 2 Server ns stats (...) MPTcpExtAddAddrTx 1 MPTcpExtEchoAdd 1 Client ns stats (...) MPTcpExtAddAddr 1 MPTcpExtEchoAddTx 1 synack rx [FAIL] got 0 JOIN[s] synack rx expected 2 ack rx [FAIL] got 0 JOIN[s] ack rx expected 2 join Rx [FAIL] see above syn tx [FAIL] got 0 JOIN[s] syn tx expected 2 join Tx [FAIL] see above I had a suspicion about what the issue could be: the ADD_ADDR might have been received after the switch to the 'fully-established' state. The issue was not easy to reproduce. The packet capture shown that the ADD_ADDR can indeed be sent with a delay, and the client would not try to establish subflows to it as expected. A simple fix is not to mark the endpoints as 'used' in the C-flag case, when looking at creating subflows to the remote initial IP address and port. In this case, there is no need to try. Note: newly added fullmesh endpoints will still continue to be used as expected, thanks to the conditions behind mptcp_pm_add_addr_c_flag_case. Fixes: 4b1ff850e0c1 ("mptcp: pm: in-kernel: usable client side with C-flag") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251020-net-mptcp-c-flag-late-add-addr-v1-1-8207030cb0e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index e0f44dc232aa..2ae95476dba3 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -370,6 +370,10 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } subflow: + /* No need to try establishing subflows to remote id0 if not allowed */ + if (mptcp_pm_add_addr_c_flag_case(msk)) + goto exit; + /* check if should create a new subflow */ while (msk->pm.local_addr_used < endp_subflow_max && msk->pm.extra_subflows < limit_extra_subflows) { @@ -401,6 +405,8 @@ subflow: __mptcp_subflow_connect(sk, &local, &addrs[i]); spin_lock_bh(&msk->pm.lock); } + +exit: mptcp_pm_nl_check_work_pending(msk); } -- cgit v1.2.3 From c5394b8b7a92c5013d2917591e28e938fe7ff2a2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Oct 2025 16:11:14 +0000 Subject: net: gro_cells: fix lock imbalance in gro_cells_receive() syzbot found that the local_unlock_nested_bh() call was missing in some cases. WARNING: possible recursive locking detected syzkaller #0 Not tainted -------------------------------------------- syz.2.329/7421 is trying to acquire lock: ffffe8ffffd48888 ((&cell->bh_lock)){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:44 [inline] ffffe8ffffd48888 ((&cell->bh_lock)){+...}-{3:3}, at: gro_cells_receive+0x404/0x790 net/core/gro_cells.c:30 but task is already holding lock: ffffe8ffffd48888 ((&cell->bh_lock)){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:44 [inline] ffffe8ffffd48888 ((&cell->bh_lock)){+...}-{3:3}, at: gro_cells_receive+0x404/0x790 net/core/gro_cells.c:30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock((&cell->bh_lock)); lock((&cell->bh_lock)); *** DEADLOCK *** Given the introduction of @have_bh_lock variable, it seems the author intent was to have the local_unlock_nested_bh() after the @unlock label. 
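The shape of the fix, sketched with a pthread mutex standing in for the nested-BH lock: once a conditional lock is tracked by a flag, the matching conditional unlock belongs on the common exit path so every route out drops it exactly once.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t bh_lock = PTHREAD_MUTEX_INITIALIZER;

    int receive(bool dev_ok)
    {
            bool have_bh_lock = false;
            int res = -1;

            if (!dev_ok)
                    goto unlock;   /* bails out before taking the lock */

            pthread_mutex_lock(&bh_lock);
            have_bh_lock = true;
            /* ... queue the skb ... */
            res = 0;

    unlock:
            if (have_bh_lock)      /* every exit path drops it exactly once */
                    pthread_mutex_unlock(&bh_lock);
            return res;
    }
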
Fixes: 25718fdcbdd2 ("net: gro_cells: Use nested-BH locking for gro_cell") Reported-by: syzbot+f9651b9a8212e1c8906f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68f65eb9.a70a0220.205af.0034.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Sebastian Andrzej Siewior Reviewed-by: David Ahern Link: https://patch.msgid.link/20251020161114.1891141-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/gro_cells.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index b43911562f4d..fd57b845de33 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -43,12 +43,11 @@ drop: if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); - if (have_bh_lock) - local_unlock_nested_bh(&gcells->cells->bh_lock); - res = NET_RX_SUCCESS; unlock: + if (have_bh_lock) + local_unlock_nested_bh(&gcells->cells->bh_lock); rcu_read_unlock(); return res; } -- cgit v1.2.3 From 441f0647f7673e0e64d4910ef61a5fb8f16bfb82 Mon Sep 17 00:00:00 2001 From: Alexey Simakov Date: Tue, 21 Oct 2025 16:00:36 +0300 Subject: sctp: avoid NULL dereference when chunk data buffer is missing chunk->skb pointer is dereferenced in the if-block where it's supposed to be NULL only. chunk->skb can only be NULL if chunk->head_skb is not. Check for frag_list instead and do it just before replacing chunk->skb. We're sure that otherwise chunk->skb is non-NULL because of outer if() condition. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Alexey Simakov Acked-by: Marcelo Ricardo Leitner Link: https://patch.msgid.link/20251021130034.6333-1-bigalex934@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/inqueue.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 5c1652181805..f5a7d5a38755 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -169,13 +169,14 @@ next_chunk: chunk->head_skb = chunk->skb; /* skbs with "cover letter" */ - if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) + if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) { + if (WARN_ON(!skb_shinfo(chunk->skb)->frag_list)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), + SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } chunk->skb = skb_shinfo(chunk->skb)->frag_list; - - if (WARN_ON(!chunk->skb)) { - __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); - sctp_chunk_free(chunk); - goto next_chunk; } } -- cgit v1.2.3 From c0178eec8884231a5ae0592b9fce827bccb77e86 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Mon, 20 Oct 2025 15:55:33 +0200 Subject: net: hsr: prevent creation of HSR device with slaves from another netns HSR/PRP driver does not handle correctly having slaves/interlink devices in a different net namespace. Currently, it is possible to create a HSR link in a different net namespace than the slaves/interlink with the following command: ip link add hsr0 netns hsr-ns type hsr slave1 eth1 slave2 eth2 As there is no use-case on supporting this scenario, enforce that HSR device link matches netns defined by IFLA_LINK_NETNSID. The iproute2 command mentioned above will throw the following error: Error: hsr: HSR slaves/interlink must be on the same net namespace than HSR link. 
Fixes: f421436a591d ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20251020135533.9373-1-fmancera@suse.de Signed-off-by: Jakub Kicinski --- net/hsr/hsr_netlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index b120470246cc..c96b63adf96f 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -34,12 +34,18 @@ static int hsr_newlink(struct net_device *dev, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); + struct net_device *link[2], *interlink = NULL; struct nlattr **data = params->data; enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; - struct net_device *link[2], *interlink = NULL; + if (!net_eq(link_net, dev_net(dev))) { + NL_SET_ERR_MSG_MOD(extack, + "HSR slaves/interlink must be on the same net namespace than HSR link"); + return -EINVAL; + } + if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); return -EINVAL; -- cgit v1.2.3 From f6ceec6434b5efff62cecbaa2ff74fc29b96c0c6 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Tue, 21 Oct 2025 12:09:40 +0200 Subject: net: datagram: introduce datagram_poll_queue for custom receive queues Some protocols using TCP encapsulation (e.g., espintcp, openvpn) deliver userspace-bound packets through a custom skb queue rather than the standard sk_receive_queue. Introduce datagram_poll_queue that accepts an explicit receive queue, and convert datagram_poll into a wrapper around datagram_poll_queue. This allows protocols with custom skb queues to reuse the core polling logic without relying on sk_receive_queue. Cc: Sabrina Dubroca Cc: Antonio Quartulli Signed-off-by: Ralf Lici Reviewed-by: Sabrina Dubroca Reviewed-by: Antonio Quartulli Link: https://patch.msgid.link/20251021100942.195010-2-ralf@mandelbit.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ net/core/datagram.c | 44 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 37 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fb3fec9affaa..a7cc3d1f4fd1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4204,6 +4204,9 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, struct sk_buff_head *sk_queue, unsigned int flags, int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err); +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + struct poll_table_struct *wait, + struct sk_buff_head *rcv_queue); __poll_t datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, diff --git a/net/core/datagram.c b/net/core/datagram.c index cb4b9ef2e4e3..c285c6465923 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -920,21 +920,22 @@ fault: EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** - * datagram_poll - generic datagram poll + * datagram_poll_queue - same as datagram_poll, but on a specific receive + * queue * @file: file struct * @sock: socket * @wait: poll table + * @rcv_queue: receive queue to poll * - * Datagram poll: Again totally generic. This also handles - * sequenced packet sockets providing the socket receive queue - * is only ever holding data ready to receive. 
+ * Performs polling on the given receive queue, handling shutdown, error, + * and connection state. This is useful for protocols that deliver + * userspace-bound packets through a custom queue instead of + * sk->sk_receive_queue. * - * Note: when you *don't* use this routine for this protocol, - * and you use a different write policy from sock_writeable() - * then please supply your own write_space callback. + * Return: poll bitmask indicating the socket's current state */ -__poll_t datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; __poll_t mask; @@ -956,7 +957,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(rcv_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -978,4 +979,27 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL(datagram_poll_queue); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you *don't* use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + * + * Return: poll bitmask indicating the socket's current state + */ +__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + return datagram_poll_queue(file, sock, wait, + &sock->sk->sk_receive_queue); +} EXPORT_SYMBOL(datagram_poll); -- cgit v1.2.3 From 0fc3e32c2c069f541f2724d91f5e98480b640326 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Tue, 21 Oct 2025 12:09:41 +0200 Subject: espintcp: use datagram_poll_queue for socket readiness espintcp uses a custom queue (ike_queue) to deliver packets to userspace. The polling logic relies on datagram_poll, which checks sk_receive_queue, which can lead to false readiness signals when that queue contains non-userspace packets. Switch espintcp_poll to use datagram_poll_queue with ike_queue, ensuring poll only signals readiness when userspace data is actually available. 
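The conversion pattern generalizes to any protocol that queues userspace-bound packets privately; a minimal sketch (foo_ctx, foo_getctx() and user_queue are hypothetical, only the datagram_poll_queue() signature is taken from the patch above):

    static __poll_t foo_poll(struct file *file, struct socket *sock,
                             poll_table *wait)
    {
            struct foo_ctx *ctx = foo_getctx(sock->sk); /* hypothetical ctx lookup */

            /* poll the protocol's own queue instead of sk->sk_receive_queue */
            return datagram_poll_queue(file, sock, wait, &ctx->user_queue);
    }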
Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)") Signed-off-by: Ralf Lici Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20251021100942.195010-3-ralf@mandelbit.com Signed-off-by: Paolo Abeni
--- net/xfrm/espintcp.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net')
diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index fc7a603b04f1..bf744ac9d5a7 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c
@@ -555,14 +555,10 @@ static void espintcp_close(struct sock *sk, long timeout) static __poll_t espintcp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (!skb_queue_empty(&ctx->ike_queue)) - mask |= EPOLLIN | EPOLLRDNORM; - - return mask; + return datagram_poll_queue(file, sock, wait, &ctx->ike_queue); } static void build_protos(struct proto *espintcp_prot,
-- cgit v1.2.3 From f7c877e7535260cc7a21484c994e8ce7e8cb6780 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Tue, 21 Oct 2025 14:17:18 +0200 Subject: vsock: fix lock inversion in vsock_assign_transport()
Syzbot reported a potential lock inversion deadlock between vsock_register_mutex and sk_lock-AF_VSOCK when vsock_linger() is called. The issue was introduced by commit 687aa0c5581b ("vsock: Fix transport_* TOCTOU") which added vsock_register_mutex locking in vsock_assign_transport() around the transport->release() call, which can call vsock_linger(). vsock_assign_transport() can be called with sk_lock held. vsock_linger() calls sk_wait_event(), which temporarily releases and re-acquires sk_lock. During this window, if another thread holds vsock_register_mutex while trying to acquire sk_lock, a circular dependency is created. Fix this by releasing vsock_register_mutex before calling transport->release() and vsock_deassign_transport(). This is safe because we don't need to hold vsock_register_mutex while releasing the old transport, and we ensure the new transport won't disappear by obtaining a module reference first via try_module_get().
Reported-by: syzbot+10e35716f8e4929681fa@syzkaller.appspotmail.com Tested-by: syzbot+10e35716f8e4929681fa@syzkaller.appspotmail.com Fixes: 687aa0c5581b ("vsock: Fix transport_* TOCTOU") Cc: mhal@rbox.co Cc: stable@vger.kernel.org Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20251021121718.137668-1-sgarzare@redhat.com Signed-off-by: Paolo Abeni
--- net/vmw_vsock/af_vsock.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'net')
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 4c2db6cca557..76763247a377 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c
@@ -487,12 +487,26 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) goto err; } - if (vsk->transport) { - if (vsk->transport == new_transport) { - ret = 0; - goto err; - } + if (vsk->transport && vsk->transport == new_transport) { + ret = 0; + goto err; + } + /* We increase the module refcnt to prevent the transport unloading + * while there are open sockets assigned to it. + */ + if (!new_transport || !try_module_get(new_transport->module)) { + ret = -ENODEV; + goto err; + } + + /* It's safe to release the mutex after a successful try_module_get(). + * Whichever transport `new_transport` points at, it won't go away until + * the last module_put() below or in vsock_deassign_transport().
+ */ + mutex_unlock(&vsock_register_mutex); + + if (vsk->transport) { /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this
@@ -512,20 +526,6 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) vsk->peer_shutdown = 0; } - /* We increase the module refcnt to prevent the transport unloading - * while there are open sockets assigned to it. - */ - if (!new_transport || !try_module_get(new_transport->module)) { - ret = -ENODEV; - goto err; - } - - /* It's safe to release the mutex after a successful try_module_get(). - * Whichever transport `new_transport` points at, it won't go away until - * the last module_put() below or in vsock_deassign_transport(). - */ - mutex_unlock(&vsock_register_mutex); - if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || !new_transport->seqpacket_allow(remote_cid)) {
-- cgit v1.2.3 From 09b0cd1297b4dbfe736aeaa0ceeab2265f47f772 Mon Sep 17 00:00:00 2001 From: Cen Zhang Date: Mon, 29 Sep 2025 05:30:17 +0000 Subject: Bluetooth: hci_sync: fix race in hci_cmd_sync_dequeue_once
hci_cmd_sync_dequeue_once() performs the lookup and the cancellation of the entry in two separate locked sections. Meanwhile, hci_cmd_sync_work() can also delete the same entry, leading to a double list_del() and a use-after-free ("UAF"). Fix this by holding cmd_sync_work_lock across both the lookup and the cancellation, so that the entry cannot be removed concurrently.
Fixes: 505ea2b29592 ("Bluetooth: hci_sync: Add helper functions to manipulate cmd_sync queue") Reported-by: Cen Zhang Signed-off-by: Cen Zhang Signed-off-by: Luiz Augusto von Dentz
--- net/bluetooth/hci_sync.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net')
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index eefdb6134ca5..d160e5e1fe8a 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c
@@ -863,11 +863,17 @@ bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, { struct hci_cmd_sync_work_entry *entry; - entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); - if (!entry) + mutex_lock(&hdev->cmd_sync_work_lock); + + entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + if (!entry) { + mutex_unlock(&hdev->cmd_sync_work_lock); return false; + } - hci_cmd_sync_cancel_entry(hdev, entry); + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + + mutex_unlock(&hdev->cmd_sync_work_lock); return true; }
-- cgit v1.2.3 From f0c200a4a537f8f374584a974518b0ce69eda76c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 26 Sep 2025 11:48:50 -0400 Subject: Bluetooth: ISO: Fix BIS connection dst_type handling
The socket dst_type cannot be directly assigned to hci_conn->dst_type since their domains are different, which may lead to the wrong address type being used.
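For context, the two address-type domains involved (constant values as commonly defined in the Bluetooth headers, shown for illustration only):

    /* socket (bdaddr) domain           HCI LE domain
     * BDADDR_LE_PUBLIC (0x01)   <-->   ADDR_LE_DEV_PUBLIC (0x00)
     * BDADDR_LE_RANDOM (0x02)   <-->   ADDR_LE_DEV_RANDOM (0x01)
     *
     * le_addr_type() converts the former into the latter, which is
     * what the fix below relies on.
     */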
Fixes: 6a5ad251b7cd ("Bluetooth: ISO: Fix possible circular locking dependency") Signed-off-by: Luiz Augusto von Dentz
--- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net')
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 9b263d061e05..954e1916506b 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c
@@ -2032,7 +2032,7 @@ static void iso_conn_ready(struct iso_conn *conn) */ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); - hcon->dst_type = iso_pi(parent)->dst_type; + hcon->dst_type = le_addr_type(iso_pi(parent)->dst_type); } if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
-- cgit v1.2.3 From 0d92808024b4e9868cef68d16f121d509843e80e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 1 Oct 2025 10:55:58 -0400 Subject: Bluetooth: HCI: Fix tracking of advertisement set/instance 0x00
This fixes the state tracking of advertisement set/instance 0x00, which is considered a legacy instance and is not tracked individually by the adv_instances list. Previously it was assumed that hci_dev itself would track it via HCI_LE_ADV, but that is a global state not specific to instance 0x00, so to fix it a new flag is introduced that only tracks the state of instance 0x00.
Fixes: 1488af7b8b5f ("Bluetooth: hci_sync: Fix hci_resume_advertising_sync") Signed-off-by: Luiz Augusto von Dentz
--- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 4 ++++ net/bluetooth/hci_sync.c | 5 ++--- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net')
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 9ecc70baaca9..8d0e703bc929 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h
@@ -434,6 +434,7 @@ enum { HCI_USER_CHANNEL, HCI_EXT_CONFIGURED, HCI_LE_ADV, + HCI_LE_ADV_0, HCI_LE_PER_ADV, HCI_LE_SCAN, HCI_SSP_ENABLED,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d790b0d4eb9a..1dabf5a7ae18 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c
@@ -1609,6 +1609,8 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, if (adv && !adv->periodic) adv->enabled = true; + else if (!set->handle) + hci_dev_set_flag(hdev, HCI_LE_ADV_0); conn = hci_lookup_le_connect(hdev); if (conn)
@@ -1619,6 +1621,8 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, if (cp->num_of_sets) { if (adv) adv->enabled = false; + else if (!set->handle) + hci_dev_clear_flag(hdev, HCI_LE_ADV_0); /* If just one instance was disabled check if there are * any other instance enabled before clearing HCI_LE_ADV
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index d160e5e1fe8a..28ad08cd7d70 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c
@@ -2606,9 +2606,8 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev) /* If current advertising instance is set to instance 0x00 * then we need to re-enable it.
*/ - if (!hdev->cur_adv_instance) - err = hci_enable_ext_advertising_sync(hdev, - hdev->cur_adv_instance); + if (hci_dev_test_and_clear_flag(hdev, HCI_LE_ADV_0)) + err = hci_enable_ext_advertising_sync(hdev, 0x00); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop -- cgit v1.2.3 From e8785404de06a69d89dcdd1e9a0b6ea42dc6d327 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Fri, 3 Oct 2025 22:07:32 +0300 Subject: Bluetooth: MGMT: fix crash in set_mesh_sync and set_mesh_complete There is a BUG: KASAN: stack-out-of-bounds in set_mesh_sync due to memcpy from badly declared on-stack flexible array. Another crash is in set_mesh_complete() due to double list_del via mgmt_pending_valid + mgmt_pending_remove. Use DEFINE_FLEX to declare the flexible array right, and don't memcpy outside bounds. As mgmt_pending_valid removes the cmd from list, use mgmt_pending_free, and also report status on error. Fixes: 302a1f674c00d ("Bluetooth: MGMT: Fix possible UAFs") Signed-off-by: Pauli Virtanen Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 2 +- net/bluetooth/mgmt.c | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 74edea06985b..bca0333f1e99 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -853,7 +853,7 @@ struct mgmt_cp_set_mesh { __le16 window; __le16 period; __u8 num_ad_types; - __u8 ad_types[]; + __u8 ad_types[] __counted_by(num_ad_types); } __packed; #define MGMT_SET_MESH_RECEIVER_SIZE 6 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a3d16eece0d2..24e335e3a727 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2175,19 +2175,24 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) sk = cmd->sk; if (status) { + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + status); mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, cmd_status_rsp, &status); - return; + goto done; } - mgmt_pending_remove(cmd); mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0); + +done: + mgmt_pending_free(cmd); } static int set_mesh_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_mesh cp; + DEFINE_FLEX(struct mgmt_cp_set_mesh, cp, ad_types, num_ad_types, + sizeof(hdev->mesh_ad_types)); size_t len; mutex_lock(&hdev->mgmt_pending_lock); @@ -2197,27 +2202,26 @@ static int set_mesh_sync(struct hci_dev *hdev, void *data) return -ECANCELED; } - memcpy(&cp, cmd->param, sizeof(cp)); + len = cmd->param_len; + memcpy(cp, cmd->param, min(__struct_size(cp), len)); mutex_unlock(&hdev->mgmt_pending_lock); - len = cmd->param_len; - memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); - if (cp.enable) + if (cp->enable) hci_dev_set_flag(hdev, HCI_MESH); else hci_dev_clear_flag(hdev, HCI_MESH); - hdev->le_scan_interval = __le16_to_cpu(cp.period); - hdev->le_scan_window = __le16_to_cpu(cp.window); + hdev->le_scan_interval = __le16_to_cpu(cp->period); + hdev->le_scan_window = __le16_to_cpu(cp->window); - len -= sizeof(cp); + len -= sizeof(struct mgmt_cp_set_mesh); /* If filters don't fit, forward all adv pkts */ if (len <= sizeof(hdev->mesh_ad_types)) - memcpy(hdev->mesh_ad_types, cp.ad_types, len); + memcpy(hdev->mesh_ad_types, cp->ad_types, len); hci_update_passive_scan_sync(hdev); return 0; -- cgit v1.2.3 From 
76e20da0bd00c556ed0a1e7250bdb6ac3e808ea8 Mon Sep 17 00:00:00 2001 From: Frédéric Danis Date: Mon, 6 Oct 2025 10:35:44 +0200 Subject: Revert "Bluetooth: L2CAP: convert timeouts to secs_to_jiffies()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
This reverts commit c9d84da18d1e0d28a7e16ca6df8e6d47570501d4. That commit replaced the L2CAP calls to msecs_to_jiffies() with secs_to_jiffies() and updated the constants accordingly. But the constants are also used in the L2CAP Configure Request and L2CAP Configure Response, which expect values in milliseconds. This may prevent correct usage of the L2CAP channel. To fix it, keep those constants in milliseconds, and so revert this change.
Fixes: c9d84da18d1e ("Bluetooth: L2CAP: convert timeouts to secs_to_jiffies()") Signed-off-by: Frédéric Danis Signed-off-by: Luiz Augusto von Dentz
--- include/net/bluetooth/l2cap.h | 4 ++-- net/bluetooth/l2cap_core.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net')
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 4bb0eaedda18..00e182a22720 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h
@@ -38,8 +38,8 @@ #define L2CAP_DEFAULT_TX_WINDOW 63 #define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF #define L2CAP_DEFAULT_MAX_TX 3 -#define L2CAP_DEFAULT_RETRANS_TO 2 /* seconds */ -#define L2CAP_DEFAULT_MONITOR_TO 12 /* seconds */ +#define L2CAP_DEFAULT_RETRANS_TO 2000 /* 2 seconds */ +#define L2CAP_DEFAULT_MONITOR_TO 12000 /* 12 seconds */ #define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */ #define L2CAP_DEFAULT_ACK_TO 200 #define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 805c752ac0a9..d08320380ad6 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c
@@ -282,7 +282,7 @@ static void __set_retrans_timer(struct l2cap_chan *chan) if (!delayed_work_pending(&chan->monitor_timer) && chan->retrans_timeout) { l2cap_set_timer(chan, &chan->retrans_timer, - secs_to_jiffies(chan->retrans_timeout)); + msecs_to_jiffies(chan->retrans_timeout)); } }
@@ -291,7 +291,7 @@ static void __set_monitor_timer(struct l2cap_chan *chan) __clear_retrans_timer(chan); if (chan->monitor_timeout) { l2cap_set_timer(chan, &chan->monitor_timer, - secs_to_jiffies(chan->monitor_timeout)); + msecs_to_jiffies(chan->monitor_timeout)); } }
-- cgit v1.2.3 From c403da5e98b04a2aec9cfb25cbeeb28d7ce29975 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 7 Oct 2025 13:29:15 -0400 Subject: Bluetooth: ISO: Fix another instance of dst_type handling
The socket dst_type cannot be directly assigned to hci_conn->dst_type since their domains are different, which may lead to the wrong address type being used.
Fixes: 6a5ad251b7cd ("Bluetooth: ISO: Fix possible circular locking dependency") Signed-off-by: Luiz Augusto von Dentz
--- net/bluetooth/iso.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net')
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 954e1916506b..3d98cb6291da 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c
@@ -2046,7 +2046,13 @@ static void iso_conn_ready(struct iso_conn *conn) } bacpy(&iso_pi(sk)->dst, &hcon->dst); - iso_pi(sk)->dst_type = hcon->dst_type; + + /* Convert from HCI to three-value type */ + if (hcon->dst_type == ADDR_LE_DEV_PUBLIC) + iso_pi(sk)->dst_type = BDADDR_LE_PUBLIC; + else + iso_pi(sk)->dst_type = BDADDR_LE_RANDOM; + iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); iso_pi(sk)->base_len = iso_pi(parent)->base_len;
-- cgit v1.2.3 From 857eb0fabc389be5159e0e17d84bc122614b5b98 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 22 Oct 2025 16:29:41 -0400 Subject: Bluetooth: hci_conn: Fix connection cleanup with BIG with 2 or more BIS
This fixes bis_cleanup not considering connections in BT_OPEN state before attempting to remove the BIG, causing the following error: btproxy[20110]: < HCI Command: LE Terminate Broadcast Isochronous Group (0x08|0x006a) plen 2 BIG Handle: 0x01 Reason: Connection Terminated By Local Host (0x16) > HCI Event: Command Status (0x0f) plen 4 LE Terminate Broadcast Isochronous Group (0x08|0x006a) ncmd 1 Status: Unknown Advertising Identifier (0x42)
Fixes: fa224d0c094a ("Bluetooth: ISO: Reassociate a socket with an active BIS") Signed-off-by: Luiz Augusto von Dentz Reviewed-by: Paul Menzel
--- net/bluetooth/hci_conn.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net')
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 111f0e37b672..c5dedf39a129 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c
@@ -843,6 +843,13 @@ static void bis_cleanup(struct hci_conn *conn) if (bis) return; + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_OPEN, + HCI_ROLE_MASTER); + if (bis) + return; + hci_le_terminate_big(hdev, conn); } else { hci_le_big_terminate(hdev, conn->iso_qos.bcast.big,
-- cgit v1.2.3 From 751463ceefc3397566d03c8b64ef4a77f5fd88ac Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 22 Oct 2025 16:03:19 -0400 Subject: Bluetooth: hci_core: Fix tracking of periodic advertisement
The periodic advertising state cannot be tracked by the shared enabled flag, since advertising and periodic advertising can each be enabled/disabled separately from one another, causing the states to become inconsistent: when, for example, an advertising set is disabled, its enabled flag is set to false, and that flag is then consulted for periodic advertising, which has not been disabled.
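An illustrative sequence of the inconsistency described above (a sketch only, derived from the checks visible in the diff below):

    /* with the shared 'enabled' flag, before this fix:
     *   periodic adv enable  -> adv->enabled = true
     *   ext adv disable      -> adv->enabled = false
     *   periodic adv disable -> skipped, because the !adv->enabled check
     *                           concludes it is already off, while the
     *                           controller is still doing periodic adv
     */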
Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_event.c | 7 +++++-- net/bluetooth/hci_sync.c | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2924c2bf2a98..b8100dbfe5d7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -244,6 +244,7 @@ struct adv_info { bool enabled; bool pending; bool periodic; + bool periodic_enabled; __u8 mesh; __u8 instance; __u8 handle; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1dabf5a7ae18..d37db364acf7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1607,7 +1607,7 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_ADV); - if (adv && !adv->periodic) + if (adv) adv->enabled = true; else if (!set->handle) hci_dev_set_flag(hdev, HCI_LE_ADV_0); @@ -3963,8 +3963,11 @@ static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_PER_ADV); if (adv) - adv->enabled = true; + adv->periodic_enabled = true; } else { + if (adv) + adv->periodic_enabled = false; + /* If just one instance was disabled check if there are * any other instance enabled before clearing HCI_LE_PER_ADV. * The current periodic adv instance will be marked as diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 28ad08cd7d70..73fc41b68b68 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1607,7 +1607,7 @@ int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) /* If periodic advertising already disabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); - if (!adv || !adv->periodic || !adv->enabled) + if (!adv || !adv->periodic_enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1672,7 +1672,7 @@ static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) /* If periodic advertising already enabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); - if (adv && adv->periodic && adv->enabled) + if (adv && adv->periodic_enabled) return 0; memset(&cp, 0, sizeof(cp)); -- cgit v1.2.3 From 91d35ec9b3956d6b3cf789c1593467e58855b03a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 23 Oct 2025 14:05:30 +0200 Subject: Bluetooth: rfcomm: fix modem control handling The RFCOMM driver confuses the local and remote modem control signals, which specifically means that the reported DTR and RTS state will instead reflect the remote end (i.e. DSR and CTS). This issue dates back to the original driver (and a follow-on update) merged in 2002, which resulted in a non-standard implementation of TIOCMSET that allowed controlling also the TS07.10 IC and DV signals by mapping them to the RI and DCD input flags, while TIOCMGET failed to return the actual state of DTR and RTS. Note that the bogus control of input signals in tiocmset() is just dead code as those flags will have been masked out by the tty layer since 2003. 
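As a reference, the TS 07.10 V.24 signals map onto the TIOCM flags roughly as follows (an illustrative summary of the change below):

    /* v24_sig bit       set locally via TIOCMSET   reported as peer state
     * RFCOMM_V24_RTC    DTR                        DSR
     * RFCOMM_V24_RTR    RTS                        CTS
     * RFCOMM_V24_IC     -                          RI
     * RFCOMM_V24_DV     -                          DCD
     */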
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Luiz Augusto von Dentz
--- net/bluetooth/rfcomm/tty.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'net')
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 376ce6de84be..b783526ab588 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c
@@ -643,8 +643,8 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) tty_port_tty_hangup(&dev->port, true); dev->modem_status = - ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | - ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | + ((v24_sig & RFCOMM_V24_RTC) ? TIOCM_DSR : 0) | + ((v24_sig & RFCOMM_V24_RTR) ? TIOCM_CTS : 0) | ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) | ((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0); }
@@ -1055,10 +1055,14 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) static int rfcomm_tty_tiocmget(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + u8 v24_sig; BT_DBG("tty %p dev %p", tty, dev); - return dev->modem_status; + rfcomm_dlc_get_modem_status(dlc, &v24_sig); + + return (v24_sig & (TIOCM_DTR | TIOCM_RTS)) | dev->modem_status; } static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
@@ -1071,23 +1075,15 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigne rfcomm_dlc_get_modem_status(dlc, &v24_sig); - if (set & TIOCM_DSR || set & TIOCM_DTR) + if (set & TIOCM_DTR) v24_sig |= RFCOMM_V24_RTC; - if (set & TIOCM_RTS || set & TIOCM_CTS) + if (set & TIOCM_RTS) v24_sig |= RFCOMM_V24_RTR; - if (set & TIOCM_RI) - v24_sig |= RFCOMM_V24_IC; - if (set & TIOCM_CD) - v24_sig |= RFCOMM_V24_DV; - if (clear & TIOCM_DSR || clear & TIOCM_DTR) + if (clear & TIOCM_DTR) v24_sig &= ~RFCOMM_V24_RTC; - if (clear & TIOCM_RTS || clear & TIOCM_CTS) + if (clear & TIOCM_RTS) v24_sig &= ~RFCOMM_V24_RTR; - if (clear & TIOCM_RI) - v24_sig &= ~RFCOMM_V24_IC; - if (clear & TIOCM_CD) - v24_sig &= ~RFCOMM_V24_DV; rfcomm_dlc_set_modem_status(dlc, v24_sig);
-- cgit v1.2.3 From 7ceba45a6658ce637da334cd0ebf27f4ede6c0fe Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 28 Oct 2025 12:58:37 +0200 Subject: wifi: cfg80211: add an hrtimer based delayed work item
The normal timer mechanism assumes that timeouts further in the future need lower accuracy. As an example, the granularity for a timer scheduled 4096 ms in the future on a 1000 Hz system is already 512 ms. This granularity is perfectly sufficient for e.g. timeouts, but there are other types of events that will happen at a future point in time and require a higher accuracy. Add a new wiphy_hrtimer_work type that uses an hrtimer internally. The API is almost identical to the existing wiphy_delayed_work and it can be used as a drop-in replacement after minor adjustments. The work will be scheduled relative to the current time with a slack of 1 millisecond.
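Usage mirrors the existing wiphy_delayed_work API; a minimal hypothetical sketch (the drv context and the TBTT use case are made up, the calls are the ones added by this patch):

    static void drv_tbtt_work(struct wiphy *wiphy, struct wiphy_work *work)
    {
            /* runs from the wiphy workqueue, under the wiphy mutex */
    }

    /* setup, once */
    wiphy_hrtimer_work_init(&drv->tbtt_work, drv_tbtt_work);

    /* fire ~25 TUs (25 * 1024 us) from now, with roughly 1 ms slack */
    wiphy_hrtimer_work_queue(wiphy, &drv->tbtt_work,
                             us_to_ktime(25 * 1024));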
CC: stable@vger.kernel.org # 6.4+ Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251028125710.7f13a2adc5eb.I01b5af0363869864b0580d9c2a1770bafab69566@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/wireless/core.c | 56 ++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 21 ++++++++++++++ 3 files changed, 155 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 781624f5913a..820e299f06b5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6435,6 +6435,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork, * after wiphy_lock() was called. Therefore, wiphy_cancel_work() can * use just cancel_work() instead of cancel_work_sync(), it requires * being in a section protected by wiphy_lock(). + * + * Note that these are scheduled with a timer where the accuracy + * becomes less the longer in the future the scheduled timer is. Use + * wiphy_hrtimer_work_queue() if the timer must be not be late by more + * than approximately 10 percent. */ void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, @@ -6506,6 +6511,79 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, bool wiphy_delayed_work_pending(struct wiphy *wiphy, struct wiphy_delayed_work *dwork); +struct wiphy_hrtimer_work { + struct wiphy_work work; + struct wiphy *wiphy; + struct hrtimer timer; +}; + +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t); + +static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork, + wiphy_work_func_t func) +{ + hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer, + CLOCK_BOOTTIME, HRTIMER_MODE_REL); + wiphy_work_init(&hrwork->work, func); +} + +/** + * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy + * @wiphy: the wiphy to queue for + * @hrwork: the high resolution timer worker + * @delay: the delay given as a ktime_t + * + * Please refer to wiphy_delayed_work_queue(). The difference is that + * the hrtimer work uses a high resolution timer for scheduling. This + * may be needed if timeouts might be scheduled further in the future + * and the accuracy of the normal timer is not sufficient. + * + * Expect a delay of a few milliseconds as the timer is scheduled + * with some slack and some more time may pass between queueing the + * work and its start. + */ +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay); + +/** + * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrtimer: the hrtimer work to cancel + * + * Cancel the work *without* waiting for it, this assumes being + * called under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrtimer); + +/** + * wiphy_hrtimer_work_flush - flush previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + +/** + * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer + * work item is currently pending. 
+ * + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work in question + * + * Return: true if timer is pending, false otherwise + * + * Please refer to the wiphy_delayed_work_pending() documentation as + * this is the equivalent function for hrtimer based delayed work + * items. + */ +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + /** * enum ieee80211_ap_reg_power - regulatory power for an Access Point * diff --git a/net/wireless/core.c b/net/wireless/core.c index 797f9f2004a6..54a34d8d356e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1787,6 +1787,62 @@ bool wiphy_delayed_work_pending(struct wiphy *wiphy, } EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t) +{ + struct wiphy_hrtimer_work *hrwork = + container_of(t, struct wiphy_hrtimer_work, timer); + + wiphy_work_queue(hrwork->wiphy, &hrwork->work); + + return HRTIMER_NORESTART; +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_timer); + +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay) +{ + trace_wiphy_hrtimer_work_queue(wiphy, &hrwork->work, delay); + + if (!delay) { + hrtimer_cancel(&hrwork->timer); + wiphy_work_queue(wiphy, &hrwork->work); + return; + } + + hrwork->wiphy = wiphy; + hrtimer_start_range_ns(&hrwork->timer, delay, + 1000 * NSEC_PER_USEC, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_queue); + +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_cancel(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_cancel); + +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_flush(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_flush); + +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + return hrtimer_is_queued(&hrwork->timer); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_pending); + static int __init cfg80211_init(void) { int err; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 8a4c34112eb5..2b71f1d867a0 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -304,6 +304,27 @@ TRACE_EVENT(wiphy_delayed_work_queue, __entry->delay) ); +TRACE_EVENT(wiphy_hrtimer_work_queue, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work, + ktime_t delay), + TP_ARGS(wiphy, work, delay), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(void *, instance) + __field(void *, func) + __field(ktime_t, delay) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->instance = work; + __entry->func = work->func; + __entry->delay = delay; + ), + TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%llu", + WIPHY_PR_ARG, __entry->instance, __entry->func, + __entry->delay) +); + TRACE_EVENT(wiphy_work_worker_start, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), -- cgit v1.2.3 From dfa865d490b1bd252045463588a91a4d3c82f3c8 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 28 Oct 2025 12:58:38 +0200 Subject: wifi: mac80211: use wiphy_hrtimer_work for ttlm_work The work item may be scheduled relatively far in the future. As the event happens at a specific point in time, the normal timer accuracy is not sufficient in that case. Switch to use wiphy_hrtimer_work so that the accuracy is sufficient. 
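To put rough numbers on the accuracy argument, reusing the granularity example from the cfg80211 patch above (illustrative only):

    TTLM switch ~5 s out:  4883 TU * 1024 us/TU ~= 5.0 s
    timer wheel, HZ=1000:  such a delay falls in a ~512 ms bucket,
                           so the work could run about half a second late
    hrtimer-based work:    ~1 ms slack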
CC: stable@vger.kernel.org Fixes: 702e80470a33 ("wifi: mac80211: support handling of advertised TID-to-link mapping") Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251028125710.83c2c611545e.I35498a6d883ea24b0dc4910cf521aa768d2a0e90@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 73fd86ec1bce..eb22279c6e01 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -616,7 +616,7 @@ struct ieee80211_if_managed { u16 removed_links; /* TID-to-link mapping support */ - struct wiphy_delayed_work ttlm_work; + struct wiphy_hrtimer_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; struct wiphy_work teardown_ttlm_work; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3b5827ea438e..623a46b3214e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -45,7 +45,7 @@ #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 -#define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100) +#define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS (100 * USEC_PER_MSEC) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 #define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) @@ -4242,7 +4242,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, @@ -7095,7 +7095,7 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, /* if a planned TID-to-link mapping was cancelled - * abort it */ - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); } else if (sdata->u.mgd.ttlm_info.active) { /* if no TID-to-link element, set to default mapping in @@ -7130,7 +7130,7 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, if (ttlm_info.switch_time) { u16 beacon_ts_tu, st_tu, delay; - u32 delay_jiffies; + u64 delay_usec; u64 mask; /* The t2l map switch time is indicated with a partial @@ -7152,23 +7152,23 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, if (delay > IEEE80211_ADV_TTLM_ST_UNDERFLOW) return; - delay_jiffies = TU_TO_JIFFIES(delay); + delay_usec = ieee80211_tu_to_usec(delay); /* Link switching can take time, so schedule it * 100ms before to be ready on time */ - if (delay_jiffies > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) - delay_jiffies -= + if (delay_usec > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) + delay_usec -= IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS; else - delay_jiffies = 0; + delay_usec = 0; sdata->u.mgd.ttlm_info = ttlm_info; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work, - delay_jiffies); + us_to_ktime(delay_usec)); return; } } @@ -8802,7 +8802,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); 
wiphy_delayed_work_init(&ifmgd->tx_tspec_wk, ieee80211_sta_handle_tspec_ac_params_wk); - wiphy_delayed_work_init(&ifmgd->ttlm_work, + wiphy_hrtimer_work_init(&ifmgd->ttlm_work, ieee80211_tid_to_link_map_work); wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, ieee80211_neg_ttlm_timeout_work); -- cgit v1.2.3 From 3f654d53dff565095d83a84e3b6187526dadf4c8 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 28 Oct 2025 12:58:39 +0200 Subject: wifi: mac80211: use wiphy_hrtimer_work for ml_reconf_work The work item may be scheduled relatively far in the future. As the event happens at a specific point in time, the normal timer accuracy is not sufficient in that case. Switch to use wiphy_hrtimer_work so that the accuracy is sufficient. CC: stable@vger.kernel.org Fixes: 8eb8dd2ffbbb ("wifi: mac80211: Support link removal using Reconfiguration ML element") Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251028125710.24a7b54e9e37.I063c5c15bf7672f94cea75f83e486a3ca52d098f@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index eb22279c6e01..eb38049b2252 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -612,7 +612,7 @@ struct ieee80211_if_managed { u8 *assoc_req_ies; size_t assoc_req_ies_len; - struct wiphy_delayed_work ml_reconf_work; + struct wiphy_hrtimer_work ml_reconf_work; u16 removed_links; /* TID-to-link mapping support */ diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 623a46b3214e..f95bcf84ecc2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4249,7 +4249,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, &ifmgd->neg_ttlm_timeout_work); sdata->u.mgd.removed_links = 0; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); wiphy_work_cancel(sdata->local->hw.wiphy, @@ -6876,7 +6876,7 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, /* In case the removal was cancelled, abort it */ if (sdata->u.mgd.removed_links) { sdata->u.mgd.removed_links = 0; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); } return; @@ -6906,9 +6906,9 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, } sdata->u.mgd.removed_links = removed_links; - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work, - TU_TO_JIFFIES(delay)); + us_to_ktime(ieee80211_tu_to_usec(delay))); } static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, @@ -8793,7 +8793,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_csa_connection_drop_work); wiphy_delayed_work_init(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); - wiphy_delayed_work_init(&ifmgd->ml_reconf_work, + wiphy_hrtimer_work_init(&ifmgd->ml_reconf_work, ieee80211_ml_reconf_work); wiphy_delayed_work_init(&ifmgd->reconf.wk, ieee80211_ml_sta_reconf_timeout); -- cgit v1.2.3 From fbc1cc6973099f45e4c30b86f12b4435c7cb7d24 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 28 Oct 2025 12:58:40 +0200 Subject: wifi: mac80211: use wiphy_hrtimer_work for csa.switch_work The work item 
may be scheduled relatively far in the future. As the event happens at a specific point in time, the normal timer accuracy is not sufficient in that case. Switch to use wiphy_hrtimer_work so that the accuracy is sufficient. To make this work, use the same clock to store the timestamp. CC: stable@vger.kernel.org Fixes: ec3252bff7b6 ("wifi: mac80211: use wiphy work for channel switch") Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251028125710.68258c7e4ac4.I4ff2b2cdffbbf858bf5f08baccc7a88c4f9efe6f@changeid Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 2 +- net/mac80211/ieee80211_i.h | 4 ++-- net/mac80211/link.c | 4 ++-- net/mac80211/mlme.c | 18 +++++++++--------- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 57065714cf8c..7f8799fd673e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1290,7 +1290,7 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) &link->csa.finalize_work); break; case NL80211_IFTYPE_STATION: - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index eb38049b2252..878c3b14aeb8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1017,10 +1017,10 @@ struct ieee80211_link_data_managed { bool operating_11g_mode; struct { - struct wiphy_delayed_work switch_work; + struct wiphy_hrtimer_work switch_work; struct cfg80211_chan_def ap_chandef; struct ieee80211_parsed_tpe tpe; - unsigned long time; + ktime_t time; bool waiting_bcn; bool ignored_same_chan; bool blocked_tx; diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d71eabe5abf8..4a19b765ccb6 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -472,10 +472,10 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, * from there. 
*/ if (link->conf->csa_active) - wiphy_delayed_work_queue(local->hw.wiphy, + wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - - jiffies); + ktime_get_boottime()); } for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f95bcf84ecc2..f3138d158535 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2594,7 +2594,7 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, return; } - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); } @@ -2753,7 +2753,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, .timestamp = timestamp, .device_timestamp = device_timestamp, }; - unsigned long now; + u32 csa_time_tu; + ktime_t now; int res; lockdep_assert_wiphy(local->hw.wiphy); @@ -2983,10 +2984,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, csa_ie.mode); /* we may have to handle timeout for deactivated link in software */ - now = jiffies; - link->u.mgd.csa.time = now + - TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * - link->conf->beacon_int); + now = ktime_get_boottime(); + csa_time_tu = (max_t(int, csa_ie.count, 1) - 1) * link->conf->beacon_int; + link->u.mgd.csa.time = now + us_to_ktime(ieee80211_tu_to_usec(csa_time_tu)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && local->ops->channel_switch) { @@ -3001,7 +3001,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, } /* channel switch handled in software */ - wiphy_delayed_work_queue(local->hw.wiphy, + wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - now); return; @@ -8849,7 +8849,7 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; - wiphy_delayed_work_init(&link->u.mgd.csa.switch_work, + wiphy_hrtimer_work_init(&link->u.mgd.csa.switch_work, ieee80211_csa_switch_work); ieee80211_clear_tpe(&link->conf->tpe); @@ -10064,7 +10064,7 @@ void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) &link->u.mgd.request_smps_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); - wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work); } -- cgit v1.2.3 From 2cbb259ec4f8e12dade80b388b81d41fa22187d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 23 Oct 2025 14:55:32 +0200 Subject: bpf: Reject negative head_room in __bpf_skb_change_head Yinhao et al. recently reported: Our fuzzing tool was able to create a BPF program which triggered the below BUG condition inside pskb_expand_head. [ 23.016047][T10006] kernel BUG at net/core/skbuff.c:2232! [...] [ 23.017301][T10006] RIP: 0010:pskb_expand_head+0x1519/0x1530 [...] [ 23.021249][T10006] Call Trace: [ 23.021387][T10006] [ 23.021507][T10006] ? __pfx_pskb_expand_head+0x10/0x10 [ 23.021725][T10006] __bpf_skb_change_head+0x22a/0x520 [ 23.021939][T10006] bpf_skb_change_head+0x34/0x1b0 [ 23.022143][T10006] ___bpf_prog_run+0xf70/0xb670 [ 23.022342][T10006] __bpf_prog_run32+0xed/0x140 [...] The problem is that in __bpf_skb_change_head() we need to reject a negative head_room as otherwise this propagates all the way to the pskb_expand_head() from skb_cow(). 
For example, if the BPF test infra passes a skb with gso_skb:1 to the BPF helper with a negative head_room of -22, then this gets passed into skb_cow(). __skb_cow() in this example calculates a delta of -86, which gets aligned to -64, and then triggers BUG_ON(nhead < 0). Thus, reject malformed negative input.
Fixes: 3a0af8fd61f9 ("bpf: BPF for lightweight tunnel infrastructure") Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Signed-off-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Reviewed-by: Dongliang Mu Link: https://patch.msgid.link/20251023125532.182262-1-daniel@iogearbox.net
--- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net')
diff --git a/net/core/filter.c b/net/core/filter.c index 76628df1fc82..fa06c5a08e22 100644 --- a/net/core/filter.c +++ b/net/core/filter.c
@@ -3877,7 +3877,8 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u32 new_len = skb->len + head_room; int ret; - if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + if (unlikely(flags || (int)head_room < 0 || + (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL;
-- cgit v1.2.3 From 514f1dc8f2ca3101e04cdf452e53baca3a76e544 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 22 Oct 2025 17:18:10 +0200 Subject: netfilter: nft_ct: enable labels for get case too
Conntrack labels can only be set when the conntrack has been created with the "ctlabel" extension. For older iptables (connlabel match), adding an "-m connlabel" rule turns on the ctlabel extension allocation for all future conntrack entries. For nftables, it's only enabled for 'ct label set foo', but not for 'ct label foo' (i.e. check). But users could have a ruleset that only checks for presence, and rely on userspace to set a label bit via the ctnetlink infrastructure. This doesn't work without adding a dummy 'ct label set' rule. We could also enable the extension infra for the first (failing) ctnetlink request, but unlike with a ruleset we would not be able to disable the extension again. Therefore turn on ctlabel extension allocation if an nftables ruleset checks for a connlabel too.
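A check-only ruleset of the kind described, which after this patch also triggers label allocation (a hypothetical example; it assumes a label named "foo" is defined in connlabel.conf):

    table inet filter {
        chain input {
            type filter hook input priority 0; policy accept;
            ct label foo counter
        }
    }

Previously such a ruleset needed a dummy 'ct label set foo' rule somewhere for the extension to be allocated.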
Fixes: 1ad8f48df6f6 ("netfilter: nftables: add connlabel set support") Reported-by: Antonio Ojea Closes: https://lore.kernel.org/netfilter-devel/aPi_VdZpVjWujZ29@strlen.de/ Signed-off-by: Florian Westphal --- net/netfilter/nft_ct.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index d526e69a2a2b..a418eb3d612b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -379,6 +379,14 @@ static bool nft_ct_tmpl_alloc_pcpu(void) } #endif +static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (priv->key == NFT_CT_LABELS) + nf_connlabels_put(ctx->net); +#endif +} + static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -413,6 +421,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; + + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; break; #endif case NFT_CT_HELPER: @@ -494,7 +506,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case IP_CT_DIR_REPLY: break; default: - return -EINVAL; + err = -EINVAL; + goto err; } } @@ -502,11 +515,11 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) - return err; + goto err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) - return err; + goto err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || @@ -514,6 +527,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, nf_ct_set_acct(ctx->net, true); return 0; +err: + __nft_ct_get_destroy(ctx, priv); + return err; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) @@ -626,6 +642,9 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + __nft_ct_get_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } -- cgit v1.2.3 From 8d96dfdcabef00e28f0c851b1502adb679dfc6d9 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 24 Oct 2025 17:54:39 +0200 Subject: netfilter: nft_connlimit: fix possible data race on connection count nft_connlimit_eval() reads priv->list->count to check if the connection limit has been exceeded. This value is being read without a lock and can be modified by a different process. Use READ_ONCE() for correctness. 
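The annotation follows the usual pattern for lockless readers; a generic sketch, not the exact nf_conncount code:

    /* update side, under the subsystem's own lock */
    WRITE_ONCE(list->count, list->count + 1);

    /* reader side, no lock held: READ_ONCE() keeps the compiler from
     * tearing the load or re-reading the value mid-evaluation
     */
    unsigned int count = READ_ONCE(list->count);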
Fixes: df4a90250976 ("netfilter: nf_conncount: merge lookup and add functions") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal
--- net/netfilter/nft_connlimit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net')
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 92b984fa8175..fc35a11cdca2 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c
@@ -48,7 +48,7 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, return; } - count = priv->list->count; + count = READ_ONCE(priv->list->count); if ((count > priv->limit) ^ priv->invert) { regs->verdict.code = NFT_BREAK;
-- cgit v1.2.3 From 90918e3b6404c2a37837b8f11692471b4c512de2 Mon Sep 17 00:00:00 2001 From: Andrii Melnychenko Date: Fri, 24 Oct 2025 18:22:16 +0200 Subject: netfilter: nft_ct: add seqadj extension for natted connections
Sequence adjustment may be required for FTP traffic with PASV/EPSV modes, due to the need to re-write the packet payload (IP, port) on the FTP control connection. This can require changes to the TCP length and expected seq / ack_seq. The easiest way to reproduce this issue is with PASV mode. Example ruleset: table inet ftp_nat { ct helper ftp_helper { type "ftp" protocol tcp l3proto inet } chain prerouting { type filter hook prerouting priority 0; policy accept; tcp dport 21 ct state new ct helper set "ftp_helper" } } table ip nat { chain prerouting { type nat hook prerouting priority -100; policy accept; tcp dport 21 dnat ip prefix to ip daddr map { 192.168.100.1 : 192.168.13.2/32 } } chain postrouting { type nat hook postrouting priority 100 ; policy accept; tcp sport 21 snat ip prefix to ip saddr map { 192.168.13.2 : 192.168.100.1/32 } } } Note that the ftp helper gets assigned *after* the dnat setup. The inverse (nat after helper assign) is handled by an existing check in nf_nat_setup_info() and will not show the problem. Topology: +-------------------+ +----------------------------------+ | FTP: 192.168.13.2 | <-> | NAT: 192.168.13.3, 192.168.100.1 | +-------------------+ +----------------------------------+ | +-----------------------+ | Client: 192.168.100.2 | +-----------------------+ ftp nat changes do not work as expected in this case: Connected to 192.168.100.1. [..] ftp> epsv EPSV/EPRT on IPv4 off. ftp> ls 227 Entering passive mode (192,168,100,1,209,129). 421 Service not available, remote server has closed connection. Kernel logs: Missing nfct_seqadj_ext_add() setup call WARNING: CPU: 1 PID: 0 at net/netfilter/nf_conntrack_seqadj.c:41 [..] __nf_nat_mangle_tcp_packet+0x100/0x160 [nf_nat] nf_nat_ftp+0x142/0x280 [nf_nat_ftp] help+0x4d1/0x880 [nf_conntrack_ftp] nf_confirm+0x122/0x2e0 [nf_conntrack] nf_hook_slow+0x3c/0xb0 .. Fix this by adding the required extension when a conntrack helper is assigned to a connection that has a nat binding.
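Concretely, the reason sequence adjustment is needed: the helper rewrites the address embedded in the PASV reply, and the rewritten string can differ in length (an illustration based on the topology above):

    server sends:  227 Entering passive mode (192,168,13,2,209,129)
    after SNAT:    227 Entering passive mode (192,168,100,1,209,129)
                   -> the payload grows by one byte, so the TCP seq/ack
                      numbers of everything that follows must be adjusted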
Fixes: 1a64edf54f55 ("netfilter: nft_ct: add helper set support") Signed-off-by: Andrii Melnychenko Signed-off-by: Florian Westphal
--- net/netfilter/nft_ct.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net')
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a418eb3d612b..6f2ae7cad731 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c
@@ -22,6 +22,7 @@ #include #include #include +#include struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4;
@@ -1192,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); + + if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct)) + if (!nfct_seqadj_ext_add(ct)) + regs->verdict.code = NF_DROP; } }
-- cgit v1.2.3 From a6f0459aadf1b41a9b9fae02006b1db024d60856 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 12:57:59 +0100 Subject: mptcp: fix subflow rcvbuf adjust
The mptcp PM can add a subflow to the conn_list before tcp_init_transfer(). Calling tcp_rcvbuf_grow() on such a subflow is not correct, as the later init will overwrite the update. Fix the issue by calling tcp_rcvbuf_grow() only after the initial receive buffer setup.
Fixes: e118cdc34dd1 ("mptcp: rcvbuf auto-tuning improvement") Signed-off-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-tcp-recv-autotune-v3-1-74b43ba4c84c@kernel.org Signed-off-by: Jakub Kicinski
--- net/mptcp/protocol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net')
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 0292162a14ee..a8a3bdf95543 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c
@@ -2051,6 +2051,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) msk->rcvq_space.space = msk->rcvq_space.copied; if (mptcp_rcvbuf_grow(sk)) { + int copied = msk->rcvq_space.copied; /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can
@@ -2063,8 +2064,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); - tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied; - tcp_rcvbuf_grow(ssk); + /* subflows can be added before tcp_init_transfer() */ + if (tcp_sk(ssk)->rcvq_space.space) { + tcp_sk(ssk)->rcvq_space.space = copied; + tcp_rcvbuf_grow(ssk); + } unlock_sock_fast(ssk, slow); } }
-- cgit v1.2.3 From b1e014a1f3275a6f3d0f2b30b8117447fc3915f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Oct 2025 12:58:01 +0100 Subject: tcp: add newval parameter to tcp_rcvbuf_grow()
This patch has no functional change, and prepares the following one. tcp_rcvbuf_grow() will need access to both the old and new values of tp->rcvq_space.space. Change mptcp_rcvbuf_grow() in a similar way.
Signed-off-by: Eric Dumazet [ Moved 'oldval' declaration to the next patch to avoid warnings at build time.
] Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20251028-net-tcp-recv-autotune-v3-3-74b43ba4c84c@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 14 +++++++------- net/mptcp/protocol.c | 20 ++++++++------------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5ca230ed526a..ab20f549b8f9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,7 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); -void tcp_rcvbuf_grow(struct sock *sk); +void tcp_rcvbuf_grow(struct sock *sk, u32 newval); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 31ea5af49f2d..cb4e07f84ae2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -891,18 +891,20 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk, u32 newval) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); - int rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap; + + tp->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; /* slow start: allow the sender to double its rate. */ - rcvwin = tp->rcvq_space.space << 1; + rcvwin = newval << 1; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; @@ -943,9 +945,7 @@ void tcp_rcv_space_adjust(struct sock *sk) trace_tcp_rcvbuf_grow(sk, time); - tp->rcvq_space.space = copied; - - tcp_rcvbuf_grow(sk); + tcp_rcvbuf_grow(sk, copied); new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -5270,7 +5270,7 @@ end: } /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ if (sk->sk_socket) - tcp_rcvbuf_grow(sk); + tcp_rcvbuf_grow(sk, tp->rcvq_space.space); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a8a3bdf95543..052a0c62023f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -194,17 +194,18 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, * - mptcp does not maintain a msk-level window clamp * - returns true when the receive buffer is actually updated */ -static bool mptcp_rcvbuf_grow(struct sock *sk) +static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) { struct mptcp_sock *msk = mptcp_sk(sk); const struct net *net = sock_net(sk); - int rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap; + msk->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return false; - rcvwin = msk->rcvq_space.space << 1; + rcvwin = newval << 1; if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; @@ -334,7 +335,7 @@ end: skb_set_owner_r(skb, sk); /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. 
*/ if (sk->sk_socket) - mptcp_rcvbuf_grow(sk); + mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); } static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, @@ -2049,10 +2050,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - msk->rcvq_space.space = msk->rcvq_space.copied; - if (mptcp_rcvbuf_grow(sk)) { - int copied = msk->rcvq_space.copied; - + if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can @@ -2065,10 +2063,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); /* subflows can be added before tcp_init_transfer() */ - if (tcp_sk(ssk)->rcvq_space.space) { - tcp_sk(ssk)->rcvq_space.space = copied; - tcp_rcvbuf_grow(ssk); - } + if (tcp_sk(ssk)->rcvq_space.space) + tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied); unlock_sock_fast(ssk, slow); } } -- cgit v1.2.3 From aa251c84636c326471ca9d53723816ba8fffe2bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Oct 2025 12:58:02 +0100 Subject: tcp: fix too slow tcp_rcvbuf_grow() action While the blamed commits apparently avoided an overshoot, they also limited how fast a sender can increase the BDP at each RTT. This is not exactly a revert: we do not re-add the 16 * tp->advmss cushion we had, and we keep the out_of_order_queue contribution. Do the same in mptcp_rcvbuf_grow(). Tested: emulated 50ms rtt (tcp_stream --tcp-tx-delay 50000), cubic 20 second flow. net.ipv4.tcp_rmem set to "4096 131072 67000000" perf record -a -e tcp:tcp_rcvbuf_grow sleep 20 perf script Before: We can see we fail to roughly double RWIN at each RTT. Sender is RWIN limited while CWND is ramping up (before getting tcp_wmem limited).
tcp_stream 33793 [010] 825.717525: tcp:tcp_rcvbuf_grow: time=100869 rtt_us=50428 copied=49152 inq=0 space=40960 ooo=0 scaling_ratio=219 rcvbuf=131072 rcv_ssthresh=103970 window_clamp=112128 rcv_wnd=106496 tcp_stream 33793 [010] 825.768966: tcp:tcp_rcvbuf_grow: time=51447 rtt_us=50362 copied=86016 inq=0 space=49152 ooo=0 scaling_ratio=219 rcvbuf=131072 rcv_ssthresh=107474 window_clamp=112128 rcv_wnd=106496 tcp_stream 33793 [010] 825.821539: tcp:tcp_rcvbuf_grow: time=52577 rtt_us=50243 copied=114688 inq=0 space=86016 ooo=0 scaling_ratio=219 rcvbuf=201096 rcv_ssthresh=167377 window_clamp=172031 rcv_wnd=167936 tcp_stream 33793 [010] 825.871781: tcp:tcp_rcvbuf_grow: time=50248 rtt_us=50237 copied=167936 inq=0 space=114688 ooo=0 scaling_ratio=219 rcvbuf=268129 rcv_ssthresh=224722 window_clamp=229375 rcv_wnd=225280 tcp_stream 33793 [010] 825.922475: tcp:tcp_rcvbuf_grow: time=50698 rtt_us=50183 copied=241664 inq=0 space=167936 ooo=0 scaling_ratio=219 rcvbuf=392617 rcv_ssthresh=331217 window_clamp=335871 rcv_wnd=323584 tcp_stream 33793 [010] 825.973326: tcp:tcp_rcvbuf_grow: time=50855 rtt_us=50213 copied=339968 inq=0 space=241664 ooo=0 scaling_ratio=219 rcvbuf=564986 rcv_ssthresh=478674 window_clamp=483327 rcv_wnd=462848 tcp_stream 33793 [010] 826.023970: tcp:tcp_rcvbuf_grow: time=50647 rtt_us=50248 copied=491520 inq=0 space=339968 ooo=0 scaling_ratio=219 rcvbuf=794811 rcv_ssthresh=671778 window_clamp=679935 rcv_wnd=651264 tcp_stream 33793 [010] 826.074612: tcp:tcp_rcvbuf_grow: time=50648 rtt_us=50227 copied=700416 inq=0 space=491520 ooo=0 scaling_ratio=219 rcvbuf=1149124 rcv_ssthresh=974881 window_clamp=983039 rcv_wnd=942080 tcp_stream 33793 [010] 826.125452: tcp:tcp_rcvbuf_grow: time=50845 rtt_us=50225 copied=987136 inq=8192 space=700416 ooo=0 scaling_ratio=219 rcvbuf=1637502 rcv_ssthresh=1392674 window_clamp=1400831 rcv_wnd=1339392 tcp_stream 33793 [010] 826.175698: tcp:tcp_rcvbuf_grow: time=50250 rtt_us=50198 copied=1347584 inq=0 space=978944 ooo=0 scaling_ratio=219 rcvbuf=2288672 rcv_ssthresh=1949729 window_clamp=1957887 rcv_wnd=1945600 tcp_stream 33793 [010] 826.225947: tcp:tcp_rcvbuf_grow: time=50252 rtt_us=50240 copied=1945600 inq=0 space=1347584 ooo=0 scaling_ratio=219 rcvbuf=3150516 rcv_ssthresh=2687010 window_clamp=2695167 rcv_wnd=2691072 tcp_stream 33793 [010] 826.276175: tcp:tcp_rcvbuf_grow: time=50233 rtt_us=50224 copied=2691072 inq=0 space=1945600 ooo=0 scaling_ratio=219 rcvbuf=4548617 rcv_ssthresh=3883041 window_clamp=3891199 rcv_wnd=3887104 tcp_stream 33793 [010] 826.326403: tcp:tcp_rcvbuf_grow: time=50233 rtt_us=50229 copied=3887104 inq=0 space=2691072 ooo=0 scaling_ratio=219 rcvbuf=6291456 rcv_ssthresh=5370482 window_clamp=5382144 rcv_wnd=5373952 tcp_stream 33793 [010] 826.376723: tcp:tcp_rcvbuf_grow: time=50323 rtt_us=50218 copied=5373952 inq=0 space=3887104 ooo=0 scaling_ratio=219 rcvbuf=9087658 rcv_ssthresh=7755537 window_clamp=7774207 rcv_wnd=7757824 tcp_stream 33793 [010] 826.426991: tcp:tcp_rcvbuf_grow: time=50274 rtt_us=50196 copied=7757824 inq=180224 space=5373952 ooo=0 scaling_ratio=219 rcvbuf=12563759 rcv_ssthresh=10729233 window_clamp=10747903 rcv_wnd=10575872 tcp_stream 33793 [010] 826.477229: tcp:tcp_rcvbuf_grow: time=50241 rtt_us=50078 copied=10731520 inq=180224 space=7577600 ooo=0 scaling_ratio=219 rcvbuf=17715667 rcv_ssthresh=15136529 window_clamp=15155199 rcv_wnd=14983168 tcp_stream 33793 [010] 826.527482: tcp:tcp_rcvbuf_grow: time=50258 rtt_us=50153 copied=15138816 inq=360448 space=10551296 ooo=0 scaling_ratio=219 rcvbuf=24667870 rcv_ssthresh=21073410 
window_clamp=21102591 rcv_wnd=20766720 tcp_stream 33793 [010] 826.577712: tcp:tcp_rcvbuf_grow: time=50234 rtt_us=50228 copied=21073920 inq=0 space=14778368 ooo=0 scaling_ratio=219 rcvbuf=34550339 rcv_ssthresh=29517041 window_clamp=29556735 rcv_wnd=29519872 tcp_stream 33793 [010] 826.627982: tcp:tcp_rcvbuf_grow: time=50275 rtt_us=50220 copied=29519872 inq=540672 space=21073920 ooo=0 scaling_ratio=219 rcvbuf=49268707 rcv_ssthresh=42090625 window_clamp=42147839 rcv_wnd=41627648 tcp_stream 33793 [010] 826.678274: tcp:tcp_rcvbuf_grow: time=50296 rtt_us=50185 copied=42053632 inq=761856 space=28979200 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57238168 window_clamp=57316406 rcv_wnd=56606720 tcp_stream 33793 [010] 826.728627: tcp:tcp_rcvbuf_grow: time=50357 rtt_us=50128 copied=43913216 inq=851968 space=41291776 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57290728 window_clamp=57316406 rcv_wnd=56524800 tcp_stream 33793 [010] 827.131364: tcp:tcp_rcvbuf_grow: time=50239 rtt_us=50127 copied=43843584 inq=655360 space=43061248 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57290728 window_clamp=57316406 rcv_wnd=56696832 tcp_stream 33793 [010] 827.181613: tcp:tcp_rcvbuf_grow: time=50254 rtt_us=50115 copied=43843584 inq=524288 space=43188224 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57290728 window_clamp=57316406 rcv_wnd=56807424 tcp_stream 33793 [010] 828.339635: tcp:tcp_rcvbuf_grow: time=50283 rtt_us=50110 copied=43843584 inq=458752 space=43319296 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57290728 window_clamp=57316406 rcv_wnd=56864768 tcp_stream 33793 [010] 828.440350: tcp:tcp_rcvbuf_grow: time=50404 rtt_us=50099 copied=43843584 inq=393216 space=43384832 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57290728 window_clamp=57316406 rcv_wnd=56922112 tcp_stream 33793 [010] 829.195106: tcp:tcp_rcvbuf_grow: time=50154 rtt_us=50077 copied=43843584 inq=196608 space=43450368 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57290728 window_clamp=57316406 rcv_wnd=57090048 After: It takes a few steps to increase RWIN. Sender is no longer RWIN limited.
tcp_stream 50826 [010] 935.634212: tcp:tcp_rcvbuf_grow: time=100788 rtt_us=50315 copied=49152 inq=0 space=40960 ooo=0 scaling_ratio=219 rcvbuf=131072 rcv_ssthresh=103970 window_clamp=112128 rcv_wnd=106496 tcp_stream 50826 [010] 935.685642: tcp:tcp_rcvbuf_grow: time=51437 rtt_us=50361 copied=86016 inq=0 space=49152 ooo=0 scaling_ratio=219 rcvbuf=160875 rcv_ssthresh=132969 window_clamp=137623 rcv_wnd=131072 tcp_stream 50826 [010] 935.738299: tcp:tcp_rcvbuf_grow: time=52660 rtt_us=50256 copied=139264 inq=0 space=86016 ooo=0 scaling_ratio=219 rcvbuf=502741 rcv_ssthresh=411497 window_clamp=430079 rcv_wnd=413696 tcp_stream 50826 [010] 935.788544: tcp:tcp_rcvbuf_grow: time=50249 rtt_us=50233 copied=307200 inq=0 space=139264 ooo=0 scaling_ratio=219 rcvbuf=728690 rcv_ssthresh=618717 window_clamp=623371 rcv_wnd=618496 tcp_stream 50826 [010] 935.838796: tcp:tcp_rcvbuf_grow: time=50258 rtt_us=50202 copied=618496 inq=0 space=307200 ooo=0 scaling_ratio=219 rcvbuf=2450338 rcv_ssthresh=1855709 window_clamp=2096187 rcv_wnd=1859584 tcp_stream 50826 [010] 935.889140: tcp:tcp_rcvbuf_grow: time=50347 rtt_us=50166 copied=1261568 inq=0 space=618496 ooo=0 scaling_ratio=219 rcvbuf=4376503 rcv_ssthresh=3725291 window_clamp=3743961 rcv_wnd=3706880 tcp_stream 50826 [010] 935.939435: tcp:tcp_rcvbuf_grow: time=50300 rtt_us=50185 copied=2478080 inq=24576 space=1261568 ooo=0 scaling_ratio=219 rcvbuf=9082648 rcv_ssthresh=7733731 window_clamp=7769921 rcv_wnd=7692288 tcp_stream 50826 [010] 935.989681: tcp:tcp_rcvbuf_grow: time=50251 rtt_us=50221 copied=4915200 inq=114688 space=2453504 ooo=0 scaling_ratio=219 rcvbuf=16574936 rcv_ssthresh=14108110 window_clamp=14179339 rcv_wnd=14024704 tcp_stream 50826 [010] 936.039967: tcp:tcp_rcvbuf_grow: time=50289 rtt_us=50279 copied=9830400 inq=114688 space=4800512 ooo=0 scaling_ratio=219 rcvbuf=32695050 rcv_ssthresh=27896187 window_clamp=27969593 rcv_wnd=27815936 tcp_stream 50826 [010] 936.090172: tcp:tcp_rcvbuf_grow: time=50211 rtt_us=50200 copied=19841024 inq=114688 space=9715712 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57245176 window_clamp=57316406 rcv_wnd=57163776 tcp_stream 50826 [010] 936.140430: tcp:tcp_rcvbuf_grow: time=50262 rtt_us=50197 copied=39501824 inq=114688 space=19726336 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57245176 window_clamp=57316406 rcv_wnd=57163776 tcp_stream 50826 [010] 936.190527: tcp:tcp_rcvbuf_grow: time=50101 rtt_us=50071 copied=43655168 inq=262144 space=39387136 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57259192 window_clamp=57316406 rcv_wnd=57032704 tcp_stream 50826 [010] 936.240719: tcp:tcp_rcvbuf_grow: time=50197 rtt_us=50057 copied=43843584 inq=262144 space=43393024 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57259192 window_clamp=57316406 rcv_wnd=57032704 tcp_stream 50826 [010] 936.341271: tcp:tcp_rcvbuf_grow: time=50297 rtt_us=50123 copied=43843584 inq=131072 space=43581440 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57259192 window_clamp=57316406 rcv_wnd=57147392 tcp_stream 50826 [010] 936.642503: tcp:tcp_rcvbuf_grow: time=50131 rtt_us=50084 copied=43843584 inq=0 space=43712512 ooo=0 scaling_ratio=219 rcvbuf=67000000 rcv_ssthresh=57259192 window_clamp=57316406 rcv_wnd=57262080 Fixes: 65c5287892e9 ("tcp: fix sk_rcvbuf overshoot") Fixes: e118cdc34dd1 ("mptcp: rcvbuf auto-tuning improvement") Reported-by: Neal Cardwell Signed-off-by: Eric Dumazet Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/589 Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Neal Cardwell Link: 
https://patch.msgid.link/20251028-net-tcp-recv-autotune-v3-4-74b43ba4c84c@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 11 +++++++++-- net/mptcp/protocol.c | 10 +++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb4e07f84ae2..e4a979b75cc6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -895,17 +895,24 @@ void tcp_rcvbuf_grow(struct sock *sk, u32 newval) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap, oldval; + u64 grow; + oldval = tp->rcvq_space.space; tp->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; - /* slow start: allow the sender to double its rate. */ + /* DRS is always one RTT late. */ rcvwin = newval << 1; + /* slow start: allow the sender to double its rate. */ + grow = (u64)rcvwin * (newval - oldval); + do_div(grow, oldval); + rcvwin += grow << 1; + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 052a0c62023f..875027b9319c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -198,15 +198,23 @@ static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) { struct mptcp_sock *msk = mptcp_sk(sk); const struct net *net = sock_net(sk); - u32 rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap, oldval; + u64 grow; + oldval = msk->rcvq_space.space; msk->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return false; + /* DRS is always one RTT late. */ rcvwin = newval << 1; + /* slow start: allow the sender to double its rate. */ + grow = (u64)rcvwin * (newval - oldval); + do_div(grow, oldval); + rcvwin += grow << 1; + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; -- cgit v1.2.3 From 27b0e701d3872ba59c5b579a9e8a02ea49ad3d3b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:52 +0100 Subject: mptcp: drop bogus optimization in __mptcp_check_push() Accessing the transmit queue without owning the msk socket lock is inherently racy, hence __mptcp_check_push() could actually quit early even when there is pending data. That in turn could cause unexpected tx stalls and timeouts. Dropping the early check avoids the race, implicitly relying on later tests under the relevant lock. With such a change, all the other mptcp_send_head() call sites are now under the msk socket lock and we can additionally drop the now unneeded annotation on the transmit head pointer accesses.
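The race being removed is easiest to see in reduced form. A minimal userspace analogue (sketch only; hypothetical names, with a pthread mutex standing in for the msk socket lock):

#include <pthread.h>

static pthread_mutex_t msk_lock = PTHREAD_MUTEX_INITIALIZER;
static void *first_pending;	/* analogue of msk->first_pending */

/* Racy shape removed by the patch: testing the transmit head outside
 * the lock can observe NULL just before another context, holding the
 * lock, queues new data -- the push is then skipped until a timeout. */
static void check_push_racy(void)
{
	if (!first_pending)
		return;			/* may quit early */
	pthread_mutex_lock(&msk_lock);
	/* ...push pending data... */
	pthread_mutex_unlock(&msk_lock);
}

/* Fixed shape: decide under the lock, where the queue is stable. */
static void check_push_fixed(void)
{
	pthread_mutex_lock(&msk_lock);
	if (first_pending) {
		/* ...push pending data... */
	}
	pthread_mutex_unlock(&msk_lock);
}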
Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-1-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 11 ++++------- net/mptcp/protocol.h | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 875027b9319c..655a2a45224f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1007,7 +1007,7 @@ static void __mptcp_clean_una(struct sock *sk) if (WARN_ON_ONCE(!msk->recovery)) break; - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + msk->first_pending = mptcp_send_next(sk); } dfrag_clear(sk, dfrag); @@ -1552,7 +1552,7 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk, mptcp_update_post_push(msk, dfrag, ret); } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + msk->first_pending = mptcp_send_next(sk); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || @@ -1912,7 +1912,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) - WRITE_ONCE(msk->first_pending, dfrag); + msk->first_pending = dfrag; } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, @@ -2882,7 +2882,7 @@ static void __mptcp_clear_xmit(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - WRITE_ONCE(msk->first_pending, NULL); + msk->first_pending = NULL; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } @@ -3422,9 +3422,6 @@ void __mptcp_data_acked(struct sock *sk) void __mptcp_check_push(struct sock *sk, struct sock *ssk) { - if (!mptcp_send_head(sk)) - return; - if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 52f9cfa4ce95..379a88e14e8d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -414,7 +414,7 @@ static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); - return READ_ONCE(msk->first_pending); + return msk->first_pending; } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) -- cgit v1.2.3 From 8e04ce45a8db7a080220e86e249198fa676b83dc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:53 +0100 Subject: mptcp: fix MSG_PEEK stream corruption If a MSG_PEEK | MSG_WAITALL read operation consumes all the bytes in the receive queue and recvmsg() needs to wait for more data - i.e. it's a blocking one - upon arrival of the next packet the MPTCP protocol will start again copying the oldest data present in the receive queue, corrupting the data stream. Address the issue by explicitly tracking the peeked sequence number, restarting from the last peeked byte.
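For context, a minimal userspace sketch of the triggering pattern (hypothetical helper, error handling trimmed; fd is assumed to be a connected MPTCP socket):

#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>

static int peek_then_read(int fd)
{
	char peeked[256], real[256];
	ssize_t n;

	/* Blocking peek: if the receive queue runs dry midway, the caller
	 * sleeps. Before the fix, bytes arriving afterwards were peeked
	 * starting again from the oldest queued byte, so the peeked buffer
	 * could contain duplicated stream data. */
	n = recv(fd, peeked, sizeof(peeked), MSG_PEEK | MSG_WAITALL);
	if (n <= 0)
		return -1;

	/* The subsequent real read must return the very same bytes. */
	if (recv(fd, real, n, MSG_WAITALL) != n)
		return -1;
	return memcmp(peeked, real, n) ? -1 : 0;
}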
Fixes: ca4fb892579f ("mptcp: add MSG_PEEK support") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Geliang Tang Tested-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-2-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 655a2a45224f..2535788569ab 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1945,22 +1945,36 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); -static int __mptcp_recvmsg_mskq(struct sock *sk, - struct msghdr *msg, - size_t len, int flags, +static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, + size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; + int total_data_len = 0; int copied = 0; skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { - u32 offset = MPTCP_SKB_CB(skb)->offset; + u32 delta, offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; - u32 count = min_t(size_t, len - copied, data_len); + u32 count; int err; + if (flags & MSG_PEEK) { + /* skip already peeked skbs */ + if (total_data_len + data_len <= copied_total) { + total_data_len += data_len; + continue; + } + + /* skip the already peeked data in the current skb */ + delta = copied_total - total_data_len; + offset += delta; + data_len -= delta; + } + + count = min_t(size_t, len - copied, data_len); if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { @@ -1977,16 +1991,14 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, copied += count; - if (count < data_len) { - if (!(flags & MSG_PEEK)) { + if (!(flags & MSG_PEEK)) { + msk->bytes_consumed += count; + if (count < data_len) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; - msk->bytes_consumed += count; + break; } - break; - } - if (!(flags & MSG_PEEK)) { /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; skb->sk = NULL; @@ -1994,7 +2006,6 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); skb_attempt_defer_free(skb); - msk->bytes_consumed += count; } if (copied >= len) @@ -2191,7 +2202,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int err, bytes_read; - bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, + copied, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; -- cgit v1.2.3 From a824084b98d8a1dbd6e85d0842a8eb5e73467f59 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:54 +0100 Subject: mptcp: restore window probe Since commit 72377ab2d671 ("mptcp: more conservative check for zero probes") the MPTCP-level zero window probe check is always disabled, as the TCP-level write queue always contains at least the newly allocated skb. Refine the relevant check, taking into account the above condition and the fact that such an skb can have zero length.
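In reduced form, the refined probe condition looks roughly like this (stub types, userspace sketch; not the kernel structures):

#include <stdbool.h>
#include <stddef.h>

struct msk_stub { unsigned long long snd_una, snd_nxt; };
struct skb_stub { size_t len; };

/* A zero-window probe is warranted only when everything sent so far has
 * been acked and the freshly allocated skb -- which is now always on the
 * ssk write queue -- is both empty and the only entry there. */
static bool zero_window_probe_ok(const struct msk_stub *msk,
				 const struct skb_stub *skb,
				 const struct skb_stub *send_head)
{
	return msk->snd_una == msk->snd_nxt &&
	       skb->len == 0 &&
	       skb == send_head;
}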
Fixes: 72377ab2d671 ("mptcp: more conservative check for zero probes") Cc: stable@vger.kernel.org Reported-by: Geliang Tang Closes: https://lore.kernel.org/d0a814c364e744ca6b836ccd5b6e9146882e8d42.camel@kernel.org Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Tested-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-3-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2535788569ab..5d8714adae6c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1299,7 +1299,12 @@ alloc_skb: if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); - if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { + /* No need for zero probe if there are any data pending + * either at the msk or ssk level; skb is the current write + * queue tail and can be empty at this point. + */ + if (snd_una != msk->snd_nxt || skb->len || + skb != tcp_send_head(ssk)) { tcp_remove_empty_skb(ssk); return 0; } -- cgit v1.2.3 From fe11dfa10919ce594682c76f5f648a0840d80a2b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 28 Oct 2025 09:16:55 +0100 Subject: mptcp: zero window probe mib Explicitly account for MPTCP-level zero window probes, so that issues like the one addressed by the previous patch can hopefully be caught earlier. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Tested-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-4-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/protocol.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 6003e47c770a..171643815076 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -85,6 +85,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), + SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), }; /* mptcp_mib_alloc - allocate percpu mib counters diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 309bac6fea32..a1d3e9369fbb 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -88,6 +88,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */ MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */ MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */ + MPTCP_MIB_WINPROBE, /* MPTCP-level zero window probe */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5d8714adae6c..2d6b8de35c44 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1355,6 +1355,7 @@ alloc_skb: mpext->dsn64); if (zero_window_probe) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) -- cgit v1.2.3 From c15d5c62ab313c19121f10e25d4fec852bd1c40c Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 26 Oct 2025 22:03:02 +0200 Subject: net: tls: Cancel RX async resync request on rcd_delta overflow When a netdev issues an RX async resync request for a TLS connection, the TLS module handles it by logging record headers and attempting to match them to the tcp_sn provided by the device.
If a match is found, the TLS module approves the tcp_sn for resynchronization. While waiting for a device response, the TLS module also increments rcd_delta each time a new TLS record is received, tracking the distance from the original resync request. However, if the device response is delayed or fails (e.g. due to an unstable connection causing the device to lose tracking, hardware errors, resource exhaustion, etc.), the TLS module keeps logging and incrementing, which can lead to a WARN() when rcd_delta exceeds the threshold. To address this, introduce tls_offload_rx_resync_async_request_cancel() to explicitly cancel resync requests when a device response failure is detected. Call this helper also as a final safeguard when rcd_delta crosses its threshold, as reaching this point implies that earlier cancellation did not occur. Signed-off-by: Shahar Shitrit Reviewed-by: Sabrina Dubroca Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761508983-937977-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tls.h | 6 ++++++ net/tls/tls_device.c | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index b90f3b675c3c..c7bcdb3afad7 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -467,6 +467,12 @@ tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_async, atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } +static inline void +tls_offload_rx_resync_async_request_cancel(struct tls_offload_resync_async *resync_async) +{ + atomic64_set(&resync_async->req, 0); +} + static inline void tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a64ae15b1a60..71734411ff4c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -723,8 +723,10 @@ tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, /* shouldn't get to wraparound: * too long in async stage, something bad happened */ - if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) + if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) { + tls_offload_rx_resync_async_request_cancel(resync_async); return false; + } /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request -- cgit v1.2.3 From 6a2108c78069fda000729b88c97b1eba0405e6d7 Mon Sep 17 00:00:00 2001 From: Shivaji Kant Date: Wed, 29 Oct 2025 06:54:19 +0000 Subject: net: devmem: refresh devmem TX dst in case of route invalidation The zero-copy Device Memory (Devmem) transmit path relies on the socket's route cache (`dst_entry`) to validate that the packet is being sent via the network device to which the DMA buffer was bound. However, this check incorrectly fails and returns `-ENODEV` if the socket's route cache entry (`dst`) is merely missing or expired (`dst == NULL`). This scenario is observed during network events, such as when flow steering rules are deleted, leading to a temporary route cache invalidation. This patch fixes the `-ENODEV` error for `net_devmem_get_binding()` by doing the following: 1. It attempts to rebuild the route via `rebuild_header()` if the route is initially missing (`dst == NULL`). This allows the TCP/IP stack to recover from transient route cache misses. 2.
It uses `rcu_read_lock()` and `dst_dev_rcu()` to safely access the network device pointer (`dst_dev`) from the route, preventing use-after-free conditions if the device is concurrently removed. 3. It maintains the critical safety check by validating that the retrieved destination device (`dst_dev`) is exactly the device registered in the Devmem binding (`binding->dev`). These changes prevent unnecessary ENODEV failures while maintaining the critical safety requirement that the Devmem resources are only used on the bound network device. Reviewed-by: Bobby Eshleman Reported-by: Eric Dumazet Reported-by: Vedant Mathur Suggested-by: Eric Dumazet Fixes: bd61848900bf ("net: devmem: Implement TX path") Signed-off-by: Shivaji Kant Link: https://patch.msgid.link/20251029065420.3489943-1-shivajikant@google.com Signed-off-by: Jakub Kicinski --- net/core/devmem.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/devmem.c b/net/core/devmem.c index d9de31a6cc7f..1d04754bc756 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -357,7 +358,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) { struct net_devmem_dmabuf_binding *binding; - struct dst_entry *dst = __sk_dst_get(sk); + struct net_device *dst_dev; + struct dst_entry *dst; int err = 0; binding = net_devmem_lookup_dmabuf(dmabuf_id); @@ -366,16 +368,35 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, goto out_err; } + rcu_read_lock(); + dst = __sk_dst_get(sk); + /* If dst is NULL (route expired), attempt to rebuild it. */ + if (unlikely(!dst)) { + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { + err = -EHOSTUNREACH; + goto out_unlock; + } + dst = __sk_dst_get(sk); + if (unlikely(!dst)) { + err = -ENODEV; + goto out_unlock; + } + } + /* The dma-addrs in this binding are only reachable to the corresponding * net_device. */ - if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + dst_dev = dst_dev_rcu(dst); + if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { err = -ENODEV; - goto out_err; + goto out_unlock; } + rcu_read_unlock(); return binding; +out_unlock: + rcu_read_unlock(); out_err: if (binding) net_devmem_dmabuf_binding_put(binding); -- cgit v1.2.3 From 51e5ad549c43b557c7da1e4d1a1dcf061b4a5f6c Mon Sep 17 00:00:00 2001 From: Ranganath V N Date: Sun, 26 Oct 2025 22:03:12 +0530 Subject: net: sctp: fix KMSAN uninit-value in sctp_inq_pop Fix an issue detected by syzbot: KMSAN reported an uninitialized-value access in sctp_inq_pop BUG: KMSAN: uninit-value in sctp_inq_pop The issue is actually caused by skb trimming via sk_filter() in sctp_rcv(). In the reproducer, skb->len becomes 1 after sk_filter(), which bypassed the original check: if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + skb_transport_offset(skb)) To handle this safely, a new check should be performed after sk_filter(). 
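The general rule behind the fix: any length validation done before a call that may trim the skb is stale afterwards. A condensed, compilable sketch of the pattern (stub types, illustration only; not the kernel code):

#include <stdbool.h>
#include <stddef.h>

struct skb_stub { size_t len; };

static void run_filter(struct skb_stub *skb)
{
	skb->len = 1;	/* an attached socket filter may trim the packet */
}

static bool rcv_path(struct skb_stub *skb, size_t min_len)
{
	if (skb->len < min_len)		/* time-of-check */
		return false;
	run_filter(skb);		/* may shrink skb->len */
	return skb->len >= min_len;	/* re-check, as the fix does */
}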
Reported-by: syzbot+d101e12bccd4095460e7@syzkaller.appspotmail.com Tested-by: syzbot+d101e12bccd4095460e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d101e12bccd4095460e7 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: Xin Long Signed-off-by: Ranganath V N Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251026-kmsan_fix-v3-1-2634a409fa5f@gmail.com Signed-off-by: Paolo Abeni --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 7e99894778d4..e119e460ccde 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -190,7 +190,7 @@ int sctp_rcv(struct sk_buff *skb) goto discard_release; nf_reset_ct(skb); - if (sk_filter(sk, skb)) + if (sk_filter(sk, skb) || skb->len < sizeof(struct sctp_chunkhdr)) goto discard_release; /* Create an SCTP packet structure. */ -- cgit v1.2.3 From 5c5f1f64681cc889d9b13e4a61285e9e029d6ab5 Mon Sep 17 00:00:00 2001 From: Raphael Pinsonneault-Thibeault Date: Fri, 24 Oct 2025 12:29:10 -0400 Subject: Bluetooth: hci_event: validate skb length for unknown CC opcode In hci_cmd_complete_evt(), if the command complete event has an unknown opcode, we assume the first byte of the remaining skb->data contains the return status. However, parameter data has previously been pulled in hci_event_func(), which may leave the skb empty. If so, using skb->data[0] for the return status uses uninitialized memory. The fix is to check skb->len before using skb->data. Reported-by: syzbot+a9a4bedfca6aa9d7fa24@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a9a4bedfca6aa9d7fa24 Tested-by: syzbot+a9a4bedfca6aa9d7fa24@syzkaller.appspotmail.com Fixes: afcb3369f46ed ("Bluetooth: hci_event: Fix vendor (unknown) opcode status handling") Signed-off-by: Raphael Pinsonneault-Thibeault Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d37db364acf7..f20c826509b6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4218,6 +4218,13 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, } if (i == ARRAY_SIZE(hci_cc_table)) { + if (!skb->len) { + bt_dev_err(hdev, "Unexpected cc 0x%4.4x with no status", + *opcode); + *status = HCI_ERROR_UNSPECIFIED; + return; + } + /* Unknown opcode, assume byte 0 contains the status, so * that e.g. __hci_cmd_sync() properly returns errors * for vendor specific commands send by HCI drivers. -- cgit v1.2.3 From 8d59fba49362c65332395789fd82771f1028d87e Mon Sep 17 00:00:00 2001 From: Ilia Gavrilov Date: Mon, 20 Oct 2025 15:12:55 +0000 Subject: Bluetooth: MGMT: Fix OOB access in parse_adv_monitor_pattern() In the parse_adv_monitor_pattern() function, the value of the 'length' variable is currently limited to HCI_MAX_EXT_AD_LENGTH (251). The size of the 'value' array in the mgmt_adv_pattern structure is 31. If the value of 'pattern[i].length' is set from user space and exceeds 31, the 'patterns[i].value' array can be accessed out of bounds when copied. Increasing the size of the 'value' array in the 'mgmt_adv_pattern' structure would break userspace. Considering this, and to avoid the OOB access, revert the limits for 'offset' and 'length' back to the value of HCI_MAX_AD_LENGTH. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE.
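The restored bounds check is compact enough to restate in isolation (illustrative userspace C; AD_MAX stands in for HCI_MAX_AD_LENGTH):

#include <stdbool.h>
#include <stdint.h>

#define AD_MAX 31	/* stand-in for HCI_MAX_AD_LENGTH */

/* offset and length are u8, so both promote to int before the addition:
 * offset + length cannot wrap, and a pattern passes only if it fits
 * entirely within the 31-byte value[] array. */
static bool adv_pattern_ok(uint8_t offset, uint8_t length)
{
	return offset < AD_MAX && length <= AD_MAX &&
	       offset + length <= AD_MAX;
}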
Fixes: db08722fc7d4 ("Bluetooth: hci_core: Fix missing instances using HCI_MAX_AD_LENGTH") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 2 +- net/bluetooth/mgmt.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index bca0333f1e99..f5be96f08b9d 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -780,7 +780,7 @@ struct mgmt_adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[31]; + __u8 value[HCI_MAX_AD_LENGTH]; } __packed; #define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052 diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 24e335e3a727..79762bfaea5f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5395,9 +5395,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; - if (offset >= HCI_MAX_EXT_AD_LENGTH || - length > HCI_MAX_EXT_AD_LENGTH || - (offset + length) > HCI_MAX_EXT_AD_LENGTH) + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); -- cgit v1.2.3 From 3d18a84eddde169d6dbf3c72cc5358b988c347d0 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 27 Oct 2025 20:46:21 +0100 Subject: net: dsa: tag_brcm: legacy: fix untagged rx on unbridged ports for bcm63xx The internal switch on BCM63XX SoCs will unconditionally add 802.1Q VLAN tags on egress to CPU when 802.1Q mode is enabled. We do this unconditionally since commit ed409f3bbaa5 ("net: dsa: b53: Configure VLANs while not filtering"). This is fine for VLAN aware bridges, but for standalone ports and vlan unaware bridges this means all packets are tagged with the default VID, which is 0. While the kernel will treat that like untagged, this can break userspace applications processing raw packets, expecting untagged traffic, like STP daemons. This also breaks several bridge tests, where the tcpdump output then does not match the expected output anymore. Since 0 isn't a valid VID, just strip out the VLAN tag if we encounter it, unless the priority field is set, since that would be a valid tag again. Fixes: 964dbf186eaa ("net: dsa: tag_brcm: add support for legacy tags") Signed-off-by: Jonas Gorski Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20251027194621.133301-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- net/dsa/tag_brcm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 26bb657ceac3..d9c77fa553b5 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -224,12 +224,14 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, { int len = BRCM_LEG_TAG_LEN; int source_port; + __be16 *proto; u8 *brcm_tag; if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) return NULL; brcm_tag = dsa_etype_header_pos_rx(skb); + proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN); source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; @@ -237,8 +239,12 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, if (!skb->dev) return NULL; - /* VLAN tag is added by BCM63xx internal switch */ - if (netdev_uses_dsa(skb->dev)) + /* The internal switch in BCM63XX SoCs always tags on egress on the CPU + * port. 
We use VID 0 internally for untagged traffic, so strip the tag * if the TCI field is all 0, and keep it otherwise to also retain * e.g. 802.1p tagged packets. */ - if (netdev_uses_dsa(skb->dev)) + if (proto[0] == htons(ETH_P_8021Q) && proto[1] == 0) len += VLAN_HLEN; /* Remove Broadcom tag and update checksum */ -- cgit v1.2.3 From c211f5d7cbd5cb34489d526648bb9c8ecc907dee Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Oct 2025 07:35:39 +0000 Subject: net: vlan: sync VLAN features with lower device After registering a VLAN device and setting its feature flags, we need to synchronize the VLAN features with the lower device. For example, if the VLAN device does not have the NETIF_F_LRO flag, LRO should also be disabled on the lower device, per the NETIF_F_UPPER_DISABLES definition. As dev->vlan_features has changed, we need to call netdev_update_features(). The call must be made after netdev_upper_dev_link() has linked the lower device, so this patch adds the netdev_update_features() call in register_vlan_dev(). Fixes: fd867d51f889 ("net/core: generic support for disabling netdev features down stack") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20251030073539.133779-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- net/8021q/vlan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index fda3a80e9340..2b74ed56eb16 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -193,6 +193,8 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; + netdev_update_features(dev); + return 0; out_unregister_netdev: -- cgit v1.2.3 From 38f50242bf0f237cdc262308d624d333286ec3c5 Mon Sep 17 00:00:00 2001 From: Stefan Wiehler Date: Tue, 28 Oct 2025 17:12:26 +0100 Subject: sctp: Hold RCU read lock while iterating over address list With CONFIG_PROVE_RCU_LIST=y and by executing $ netcat -l --sctp & $ netcat --sctp localhost & $ ss --sctp one can trigger the following Lockdep-RCU splat(s): WARNING: suspicious RCU usage 6.18.0-rc1-00093-g7f864458e9a6 #5 Not tainted ----------------------------- net/sctp/diag.c:76 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by ss/215: #0: ffff9c740828bec0 (nlk_cb_mutex-SOCK_DIAG){+.+.}-{4:4}, at: __netlink_dump_start+0x84/0x2b0 #1: ffff9c7401d72cd0 (sk_lock-AF_INET6){+.+.}-{0:0}, at: sctp_sock_dump+0x38/0x200 stack backtrace: CPU: 0 UID: 0 PID: 215 Comm: ss Not tainted 6.18.0-rc1-00093-g7f864458e9a6 #5 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x5d/0x90 lockdep_rcu_suspicious.cold+0x4e/0xa3 inet_sctp_diag_fill.isra.0+0x4b1/0x5d0 sctp_sock_dump+0x131/0x200 sctp_transport_traverse_process+0x170/0x1b0 ? __pfx_sctp_sock_filter+0x10/0x10 ? __pfx_sctp_sock_dump+0x10/0x10 sctp_diag_dump+0x103/0x140 __inet_diag_dump+0x70/0xb0 netlink_dump+0x148/0x490 __netlink_dump_start+0x1f3/0x2b0 inet_diag_handler_cmd+0xcd/0x100 ? __pfx_inet_diag_dump_start+0x10/0x10 ? __pfx_inet_diag_dump+0x10/0x10 ? __pfx_inet_diag_dump_done+0x10/0x10 sock_diag_rcv_msg+0x18e/0x320 ? __pfx_sock_diag_rcv_msg+0x10/0x10 netlink_rcv_skb+0x4d/0x100 netlink_unicast+0x1d7/0x2b0 netlink_sendmsg+0x203/0x450 ____sys_sendmsg+0x30c/0x340 ___sys_sendmsg+0x94/0xf0 __sys_sendmsg+0x83/0xf0 do_syscall_64+0xbb/0x390 entry_SYSCALL_64_after_hwframe+0x77/0x7f ...
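Worth noting (and handled by the next patch in this series): the read-side critical section is dropped between the counting walk and the copying walk, so the list may grow in between and the count can be stale. A condensed sketch of the guard that pairs with this fix (stub types, illustration only):

#include <stddef.h>

struct entry { struct entry *next; };

/* Pass 1 (under one RCU section): count elements.
 * Pass 2 (under another): copy them -- but never more than counted,
 * since the list may have grown while no lock was held. */
static void copy_bounded(char *dst, size_t elem_sz,
			 const struct entry *head, int counted)
{
	for (const struct entry *e = head; e && counted; e = e->next) {
		/* memcpy(dst, payload(e), elem_sz) in the real code */
		dst += elem_sz;
		counted--;
	}
}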
Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Stefan Wiehler Reviewed-by: Kuniyuki Iwashima Acked-by: Xin Long Link: https://patch.msgid.link/20251028161506.3294376-2-stefan.wiehler@nokia.com Signed-off-by: Jakub Kicinski --- net/sctp/diag.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 996c2018f0e6..1a8761f87bf1 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -73,19 +73,23 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, struct nlattr *attr; void *info = NULL; + rcu_read_lock(); list_for_each_entry_rcu(laddr, address_list, list) addrcnt++; + rcu_read_unlock(); attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); if (!attr) return -EMSGSIZE; info = nla_data(attr); + rcu_read_lock(); list_for_each_entry_rcu(laddr, address_list, list) { memcpy(info, &laddr->a, sizeof(laddr->a)); memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; } + rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 95aef86ab231f047bb8085c70666059b58f53c09 Mon Sep 17 00:00:00 2001 From: Stefan Wiehler Date: Tue, 28 Oct 2025 17:12:27 +0100 Subject: sctp: Prevent TOCTOU out-of-bounds write For the following path not holding the sock lock, sctp_diag_dump() -> sctp_for_each_endpoint() -> sctp_ep_dump() make sure not to exceed bounds in case the address list has grown between buffer allocation (time-of-check) and write (time-of-use). Suggested-by: Kuniyuki Iwashima Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Stefan Wiehler Reviewed-by: Kuniyuki Iwashima Acked-by: Xin Long Link: https://patch.msgid.link/20251028161506.3294376-3-stefan.wiehler@nokia.com Signed-off-by: Jakub Kicinski --- net/sctp/diag.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 1a8761f87bf1..5d64dd99ca9a 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -88,6 +88,9 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, memcpy(info, &laddr->a, sizeof(laddr->a)); memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; + + if (!--addrcnt) + break; } rcu_read_unlock(); -- cgit v1.2.3 From f1fc201148c7e684c10a72b6a3375597f28d1ef6 Mon Sep 17 00:00:00 2001 From: Stefan Wiehler Date: Tue, 28 Oct 2025 17:12:28 +0100 Subject: sctp: Hold sock lock while iterating over address list Move address list traversal in inet_assoc_attr_size() under the sock lock to avoid holding the RCU read lock. 
Suggested-by: Xin Long Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Stefan Wiehler Acked-by: Xin Long Link: https://patch.msgid.link/20251028161506.3294376-4-stefan.wiehler@nokia.com Signed-off-by: Jakub Kicinski --- net/sctp/diag.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 5d64dd99ca9a..2afb376299fe 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -230,14 +230,15 @@ struct sctp_comm_param { bool net_admin; }; -static size_t inet_assoc_attr_size(struct sctp_association *asoc) +static size_t inet_assoc_attr_size(struct sock *sk, + struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct sctp_sockaddr_entry *laddr; list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, - list) + list, lockdep_sock_is_held(sk)) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) @@ -263,11 +264,14 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t if (err) return err; - rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); - if (!rep) + lock_sock(sk); + + rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL); + if (!rep) { + release_sock(sk); return -ENOMEM; + } - lock_sock(sk); if (ep != assoc->ep) { err = -EAGAIN; goto out; -- cgit v1.2.3 From 59b20b15c112867f28a12a24aa25f14549db02e4 Mon Sep 17 00:00:00 2001 From: Huiwen He Date: Mon, 3 Nov 2025 10:36:19 +0800 Subject: sctp: make sctp_transport_init() void sctp_transport_init() is static and never returns NULL. It is only called by sctp_transport_new(), so change it to void and remove the redundant return value check. Signed-off-by: Huiwen He Acked-by: Xin Long Link: https://patch.msgid.link/20251103023619.1025622-1-hehuiwen@kylinos.cn Signed-off-by: Jakub Kicinski --- net/sctp/transport.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 4d258a6e8033..0d48c61fe6ad 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -37,10 +37,10 @@ /* 1st Level Abstractions. */ /* Initialize a new transport from provided memory. */ -static struct sctp_transport *sctp_transport_init(struct net *net, - struct sctp_transport *peer, - const union sctp_addr *addr, - gfp_t gfp) +static void sctp_transport_init(struct net *net, + struct sctp_transport *peer, + const union sctp_addr *addr, + gfp_t gfp) { /* Copy in the address. */ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); @@ -83,8 +83,6 @@ static struct sctp_transport *sctp_transport_init(struct net *net, get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); refcount_set(&peer->refcnt, 1); - - return peer; } /* Allocate and initialize a new transport. */ @@ -96,20 +94,13 @@ struct sctp_transport *sctp_transport_new(struct net *net, transport = kzalloc(sizeof(*transport), gfp); if (!transport) - goto fail; + return NULL; - if (!sctp_transport_init(net, transport, addr, gfp)) - goto fail_init; + sctp_transport_init(net, transport, addr, gfp); SCTP_DBG_OBJCNT_INC(transport); return transport; - -fail_init: - kfree(transport); - -fail: - return NULL; } /* This transport is no longer needed. 
Free up if possible, or -- cgit v1.2.3 From 327c20c21d80e0d87834b392d83ae73c955ad8ff Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 3 Nov 2025 08:38:17 -0800 Subject: netpoll: Fix deadlock in memory allocation under spinlock Fix an AA deadlock in refill_skbs() where memory allocation while holding skb_pool->lock can trigger a recursive lock acquisition attempt. The deadlock scenario occurs when the system is under severe memory pressure: 1. refill_skbs() acquires skb_pool->lock (spinlock) 2. alloc_skb() is called while holding the lock 3. Memory allocator fails and calls slab_out_of_memory() 4. This triggers printk() for the OOM warning 5. The console output path calls netpoll_send_udp() 6. netpoll_send_udp() attempts to acquire the same skb_pool->lock 7. Deadlock: the lock is already held by the same CPU Call stack: refill_skbs() spin_lock_irqsave(&skb_pool->lock) <- lock acquired __alloc_skb() kmem_cache_alloc_node_noprof() slab_out_of_memory() printk() console_flush_all() netpoll_send_udp() skb_dequeue() spin_lock_irqsave(&skb_pool->lock) <- deadlock attempt This bug was exposed by commit 248f6571fd4c51 ("netpoll: Optimize skb refilling on critical path"), which removed refill_skbs() from the critical path (where nested printk was being deferred), letting nested printk be called from inside refill_skbs(). Refactor refill_skbs() to never allocate memory while holding the spinlock. Another possible solution is to protect refill_skbs() from nested printks by calling printk_deferred_{enter,exit}() in refill_skbs(); any nested pr_warn() would then be deferred. I prefer this approach, given I _think_ it might be a good idea to move the alloc_skb() from GFP_ATOMIC to GFP_KERNEL in the future, so having the alloc_skb() outside of the lock will be a necessary step. There is a possible TOCTOU issue between checking the pool length and queueing the newly allocated skb, but this is not a problem, given that an extra SKB in the pool is harmless and will eventually be used. Signed-off-by: Breno Leitao Fixes: 248f6571fd4c51 ("netpoll: Optimize skb refilling on critical path") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251103-fix_netpoll_aa-v4-1-4cfecdf6da7c@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 60a05d3b7c24..c85f740065fc 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -228,19 +228,16 @@ static void refill_skbs(struct netpoll *np) { struct sk_buff_head *skb_pool; struct sk_buff *skb; - unsigned long flags; skb_pool = &np->skb_pool; - spin_lock_irqsave(&skb_pool->lock, flags); - while (skb_pool->qlen < MAX_SKBS) { + while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(skb_pool, skb); + skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool->lock, flags); } static void zap_completion_queue(void) -- cgit v1.2.3 From d917c217b612971ea05ae1582e8740b747e0e7e8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 4 Nov 2025 16:34:35 +0100 Subject: net: gro_cells: Reduce lock scope in gro_cell_poll One GRO-cell device's NAPI callback can nest into the GRO-cell of another device if the underlying device is also using GRO-cell. This is the case for IPsec over vxlan. These two GRO-cells are separate devices.
From lockdep's point of view it is the same, because each device shares the same lock class, and so it reports a possible deadlock assuming one device is nesting into itself. Hold the bh_lock only while accessing gro_cell::napi_skbs in gro_cell_poll(). This reduces the locking scope and avoids acquiring the same lock class multiple times. Fixes: 25718fdcbdd2 ("net: gro_cells: Use nested-BH locking for gro_cell") Reported-by: Gal Pressman Closes: https://lore.kernel.org/all/66664116-edb8-48dc-ad72-d5223696dd19@nvidia.com/ Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20251104153435.ty88xDQt@linutronix.de Signed-off-by: Jakub Kicinski --- net/core/gro_cells.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index fd57b845de33..a725d21159a6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -60,9 +60,10 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) struct sk_buff *skb; int work_done = 0; - __local_lock_nested_bh(&cell->bh_lock); while (work_done < budget) { + __local_lock_nested_bh(&cell->bh_lock); skb = __skb_dequeue(&cell->napi_skbs); + __local_unlock_nested_bh(&cell->bh_lock); if (!skb) break; napi_gro_receive(napi, skb); @@ -71,7 +72,6 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) if (work_done < budget) napi_complete_done(napi, work_done); - __local_unlock_nested_bh(&cell->bh_lock); return work_done; } -- cgit v1.2.3 From 8dca36978aa80bab9d4da130c211db75c9e00048 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 5 Nov 2025 13:19:18 +0200 Subject: net: bridge: fix use-after-free due to MST port state bypass syzbot reported[1] a use-after-free when deleting an expired fdb. It is due to a race condition between learning still happening and a port being deleted, after all its fdbs have been flushed. The port's state has been toggled to disabled, so no learning should happen at that time, but if we have MST enabled, it will bypass the port's state; that, together with VLAN filtering disabled, can lead to fdb learning while the port is being deleted, at a time when it shouldn't happen. VLAN filtering must be disabled because we flush the port VLANs when the port is deleted, which stops learning. This fix adds a check for the port's vlan group, which is set to NULL when the port is being deleted; that avoids the port state bypass. When MST is enabled, there will be minimal new overhead in the fast path because the port's vlan group pointer is cache-hot.
[1] https://syzkaller.appspot.com/bug?extid=dd280197f0f7ab3917be Fixes: ec7328b59176 ("net: bridge: mst: Multiple Spanning Tree (MST) mode") Reported-by: syzbot+dd280197f0f7ab3917be@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69088ffa.050a0220.29fc44.003d.GAE@google.com/ Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20251105111919.1499702-2-razor@blackwall.org Signed-off-by: Jakub Kicinski --- net/bridge/br_forward.c | 2 +- net/bridge/br_input.c | 4 ++-- net/bridge/br_private.h | 8 +++++--- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 870bdf2e082c..dea09096ad0f 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - (br_mst_is_enabled(p->br) || p->state == BR_STATE_FORWARDING) && + (br_mst_is_enabled(p) || p->state == BR_STATE_FORWARDING) && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 67b4c905e49a..777fa869c1a1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -94,7 +94,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br = p->br; - if (br_mst_is_enabled(br)) { + if (br_mst_is_enabled(p)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) { @@ -429,7 +429,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; forward: - if (br_mst_is_enabled(p->br)) + if (br_mst_is_enabled(p)) goto defer_stp_filtering; switch (p->state) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 16be5d250402..b571d6f61389 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1935,10 +1935,12 @@ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) /* br_mst.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING DECLARE_STATIC_KEY_FALSE(br_mst_used); -static inline bool br_mst_is_enabled(struct net_bridge *br) +static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { + /* check the port's vlan group to avoid racing with port deletion */ return static_branch_unlikely(&br_mst_used) && - br_opt_get(br, BROPT_MST_ENABLED); + br_opt_get(p->br, BROPT_MST_ENABLED) && + rcu_access_pointer(p->vlgrp); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, @@ -1953,7 +1955,7 @@ int br_mst_fill_info(struct sk_buff *skb, int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack); #else -static inline bool br_mst_is_enabled(struct net_bridge *br) +static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { return false; } -- cgit v1.2.3 From ee87c63f9b2a418f698d79c2991347e31a7d2c27 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 5 Nov 2025 13:19:19 +0200 Subject: net: bridge: fix MST static key usage As Ido pointed out, the static key usage in MST is buggy and should use inc/dec instead of enable/disable, because we can have multiple bridges with MST enabled, which means a single bridge could disable MST for all of them. Use static_branch_inc/dec to avoid that. When destroying a bridge, decrement the key if MST was enabled.
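The distinction matters because static_branch_enable()/disable() act like a boolean, while static_branch_inc()/dec() act like a reference count. A userspace analogue (illustration only; plain C atomics instead of jump labels):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int mst_users;	/* analogue of the br_mst_used key */

static void bridge_mst_on(void)  { atomic_fetch_add(&mst_users, 1); }
static void bridge_mst_off(void) { atomic_fetch_sub(&mst_users, 1); }

/* The fast path stays on while *any* bridge still uses MST; with the
 * old enable/disable scheme, the first bridge to turn MST off disabled
 * the branch for every other bridge as well. */
static bool mst_fast_path_enabled(void)
{
	return atomic_load(&mst_users) > 0;
}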
Fixes: ec7328b59176 ("net: bridge: mst: Multiple Spanning Tree (MST) mode") Reported-by: Ido Schimmel Closes: https://lore.kernel.org/netdev/20251104120313.1306566-1-razor@blackwall.org/T/#m6888d87658f94ed1725433940f4f4ebb00b5a68b Signed-off-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20251105111919.1499702-3-razor@blackwall.org Signed-off-by: Jakub Kicinski --- net/bridge/br_if.c | 1 + net/bridge/br_mst.c | 10 ++++++++-- net/bridge/br_private.h | 5 +++++ 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 98c5b9c3145f..ca3a637d7cca 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -386,6 +386,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_mst_uninit(br); br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 3f24b4ee49c2..43a300ae6bfa 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -22,6 +22,12 @@ bool br_mst_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_mst_enabled); +void br_mst_uninit(struct net_bridge *br) +{ + if (br_opt_get(br, BROPT_MST_ENABLED)) + static_branch_dec(&br_mst_used); +} + int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { const struct net_bridge_vlan_group *vg; @@ -225,9 +231,9 @@ int br_mst_set_enabled(struct net_bridge *br, bool on, return err; if (on) - static_branch_enable(&br_mst_used); + static_branch_inc(&br_mst_used); else - static_branch_disable(&br_mst_used); + static_branch_dec(&br_mst_used); br_opt_toggle(br, BROPT_MST_ENABLED, on); return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b571d6f61389..7280c4e9305f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1954,6 +1954,7 @@ int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg); int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack); +void br_mst_uninit(struct net_bridge *br); #else static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { @@ -1989,6 +1990,10 @@ static inline int br_mst_process(struct net_bridge_port *p, { return -EOPNOTSUPP; } + +static inline void br_mst_uninit(struct net_bridge *br) +{ +} #endif struct nf_br_ops { -- cgit v1.2.3