From eb09fbeb48709fe66c0d708aed81e910a577a30a Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Wed, 13 Nov 2024 17:51:29 +0800 Subject: mac802154: check local interfaces before deleting sdata list syzkaller reported a corrupted list in ieee802154_if_remove. [1] Remove an IEEE 802.15.4 network interface after unregister an IEEE 802.15.4 hardware device from the system. CPU0 CPU1 ==== ==== genl_family_rcv_msg_doit ieee802154_unregister_hw ieee802154_del_iface ieee802154_remove_interfaces rdev_del_virtual_intf_deprecated list_del(&sdata->list) ieee802154_if_remove list_del_rcu The net device has been unregistered, since the rcu grace period, unregistration must be run before ieee802154_if_remove. To avoid this issue, add a check for local->interfaces before deleting sdata list. [1] kernel BUG at lib/list_debug.c:58! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6277 Comm: syz-executor157 Not tainted 6.12.0-rc6-syzkaller-00005-g557329bcecc2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__list_del_entry_valid_or_report+0xf4/0x140 lib/list_debug.c:56 Code: e8 a1 7e 00 07 90 0f 0b 48 c7 c7 e0 37 60 8c 4c 89 fe e8 8f 7e 00 07 90 0f 0b 48 c7 c7 40 38 60 8c 4c 89 fe e8 7d 7e 00 07 90 <0f> 0b 48 c7 c7 a0 38 60 8c 4c 89 fe e8 6b 7e 00 07 90 0f 0b 48 c7 RSP: 0018:ffffc9000490f3d0 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: d211eee56bb28d00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88805b278dd8 R08: ffffffff8174a12c R09: 1ffffffff2852f0d R10: dffffc0000000000 R11: fffffbfff2852f0e R12: dffffc0000000000 R13: dffffc0000000000 R14: dead000000000100 R15: ffff88805b278cc0 FS: 0000555572f94380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056262e4a3000 CR3: 0000000078496000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:157 [inline] ieee802154_if_remove+0x86/0x1e0 net/mac802154/iface.c:687 rdev_del_virtual_intf_deprecated net/ieee802154/rdev-ops.h:24 [inline] ieee802154_del_iface+0x2c0/0x5c0 net/ieee802154/nl-phy.c:323 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:744 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2607 ___sys_sendmsg net/socket.c:2661 [inline] __sys_sendmsg+0x292/0x380 net/socket.c:2690 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-and-tested-by: syzbot+985f827280dc3a6e7e92@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=985f827280dc3a6e7e92 Signed-off-by: Lizhi Xu Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241113095129.1457225-1-lizhi.xu@windriver.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index c0e2da5072be..9e4631fade90 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); + if (list_empty(&sdata->local->interfaces)) { + mutex_unlock(&sdata->local->iflist_mtx); + return; + } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); -- cgit v1.2.3 From 1e9b0e1c550c42c13c111d1a31e822057232abc4 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Thu, 2 Jan 2025 20:23:00 -0500 Subject: net: 802: LLC+SNAP OID:PID lookup on start of skb data 802.2+LLC+SNAP frames received by napi_complete_done() with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. This was an issue as snap_rcv() expected offset to point to SNAP header (OID:PID), causing packet to be dropped. A fix at llc_fixup_skb() (a024e377efed) resets transport_header for any LLC consumers that may care about it, and stops SNAP packets from being dropped, but doesn't fix the problem which is that LLC and SNAP should not use transport_header offset. Ths patch eliminates the use of transport_header offset for SNAP lookup of OID:PID so that SNAP does not rely on the offset at all. The offset is reset after pull for any SNAP packet consumers that may (but shouldn't) use it. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103012303.746521-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/802/psnap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/802/psnap.c b/net/802/psnap.c index fca9d454905f..389df460c8c4 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; rcu_read_lock(); - proto = find_snap_client(skb_transport_header(skb)); + proto = find_snap_client(skb->data); if (proto) { /* Pass the frame on. */ - skb->transport_header += 5; skb_pull_rcsum(skb, 5); + skb_reset_transport_header(skb); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); -- cgit v1.2.3 From a039e54397c6a75b713b9ce7894a62e06956aa92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 10:45:46 +0000 Subject: net_sched: cls_flow: validate TCA_FLOW_RSHIFT attribute syzbot found that TCA_FLOW_RSHIFT attribute was not validated. Right shitfing a 32bit integer is undefined for large shift values. UBSAN: shift-out-of-bounds in net/sched/cls_flow.c:329:23 shift exponent 9445 is too large for 32-bit type 'u32' (aka 'unsigned int') CPU: 1 UID: 0 PID: 54 Comm: kworker/u8:3 Not tainted 6.13.0-rc3-syzkaller-00180-g4f619d518db9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_shift_out_of_bounds+0x3c8/0x420 lib/ubsan.c:468 flow_classify+0x24d5/0x25b0 net/sched/cls_flow.c:329 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1771 [inline] tcf_classify+0x420/0x1160 net/sched/cls_api.c:1867 sfb_classify net/sched/sch_sfb.c:260 [inline] sfb_enqueue+0x3ad/0x18b0 net/sched/sch_sfb.c:318 dev_qdisc_enqueue+0x4b/0x290 net/core/dev.c:3793 __dev_xmit_skb net/core/dev.c:3889 [inline] __dev_queue_xmit+0xf0e/0x3f50 net/core/dev.c:4400 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 iptunnel_xmit+0x55d/0x9b0 net/ipv4/ip_tunnel_core.c:82 udp_tunnel_xmit_skb+0x262/0x3b0 net/ipv4/udp_tunnel_core.c:173 geneve_xmit_skb drivers/net/geneve.c:916 [inline] geneve_xmit+0x21dc/0x2d00 drivers/net/geneve.c:1039 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x27a/0x7d0 net/core/dev.c:3606 __dev_queue_xmit+0x1b73/0x3f50 net/core/dev.c:4434 Fixes: e5dfb815181f ("[NET_SCHED]: Add flow classifier") Reported-by: syzbot+1dbb57d994e54aaa04d2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6777bf49.050a0220.178762.0040.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250103104546.3714168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace7..5c2580a07530 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, -- cgit v1.2.3 From fd48f071a3d6d51e737e953bb43fe69785cf59a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:32:07 -0800 Subject: net: don't dump Tx and uninitialized NAPIs We use NAPI ID as the key for continuing dumps. We also depend on the NAPIs being sorted by ID within the driver list. Tx NAPIs (which don't have an ID assigned) break this expectation, it's not currently possible to dump them reliably. Since Tx NAPIs are relatively rare, and can't be used in doit (GET or SET) hide them from the dump API as well. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103183207.1216004-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/netdev-genl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b0772d135efb..125b660004d3 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -176,8 +176,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (!hdr) return -EMSGSIZE; - if (napi->napi_id >= MIN_NAPI_ID && - nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) @@ -272,6 +271,8 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (napi->napi_id < MIN_NAPI_ID) + continue; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; -- cgit v1.2.3 From b341ca51d2679829d26a3f6a4aa9aee9abd94f92 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sat, 4 Jan 2025 10:29:45 -0500 Subject: tls: Fix tls_sw_sendmsg error handling We've noticed that NFS can hang when using RPC over TLS on an unstable connection, and investigation shows that the RPC layer is stuck in a tight loop attempting to transmit, but forever getting -EBADMSG back from the underlying network. The loop begins when tcp_sendmsg_locked() returns -EPIPE to tls_tx_records(), but that error is converted to -EBADMSG when calling the socket's error reporting handler. Instead of converting errors from tcp_sendmsg_locked(), let's pass them along in this path. The RPC layer handles -EPIPE by reconnecting the transport, which prevents the endless attempts to transmit on a broken connection. Signed-off-by: Benjamin Coddington Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Link: https://patch.msgid.link/9594185559881679d81f071b181a10eb07cd079f.1736004079.git.bcodding@redhat.com Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbf26cc4f6ee..7bcc9b4408a2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -458,7 +458,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, -EBADMSG); + tls_err_abort(sk, rc); return rc; } -- cgit v1.2.3 From cb358ff94154774d031159b018adf45e17673941 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 6 Jan 2025 16:19:11 +0900 Subject: ipvlan: Fix use-after-free in ipvlan_get_iflink(). syzbot presented an use-after-free report [0] regarding ipvlan and linkwatch. ipvlan does not hold a refcnt of the lower device unlike vlan and macvlan. If the linkwatch work is triggered for the ipvlan dev, the lower dev might have already been freed, resulting in UAF of ipvlan->phy_dev in ipvlan_get_iflink(). We can delay the lower dev unregistration like vlan and macvlan by holding the lower dev's refcnt in dev->netdev_ops->ndo_init() and releasing it in dev->priv_destructor(). Jakub pointed out calling .ndo_XXX after unregister_netdevice() has returned is error prone and suggested [1] addressing this UAF in the core by taking commit 750e51603395 ("net: avoid potential UAF in default_operstate()") further. Let's assume unregistering devices DOWN and use RCU protection in default_operstate() not to race with the device unregistration. [0]: BUG: KASAN: slab-use-after-free in ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 Read of size 4 at addr ffff0000d768c0e0 by task kworker/u8:35/6944 CPU: 0 UID: 0 PID: 6944 Comm: kworker/u8:35 Not tainted 6.13.0-rc2-g9bc5c9515b48 #12 4c3cb9e8b4565456f6a355f312ff91f4f29b3c47 Hardware name: linux,dummy-virt (DT) Workqueue: events_unbound linkwatch_event Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:484 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load4_noabort+0x20/0x30 mm/kasan/report_generic.c:380 ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 dev_get_iflink+0x7c/0xd8 net/core/dev.c:674 default_operstate net/core/link_watch.c:45 [inline] rfc2863_policy+0x144/0x360 net/core/link_watch.c:72 linkwatch_do_dev+0x60/0x228 net/core/link_watch.c:175 __linkwatch_run_queue+0x2f4/0x5b8 net/core/link_watch.c:239 linkwatch_event+0x64/0xa8 net/core/link_watch.c:282 process_one_work+0x700/0x1398 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3391 kthread+0x2b0/0x360 kernel/kthread.c:389 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 Allocated by task 9303: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4283 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4289 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:650 alloc_netdev_mqs+0xb4/0x1118 net/core/dev.c:11209 rtnl_create_link+0x2b8/0xb60 net/core/rtnetlink.c:3595 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3771 __rtnl_newlink net/core/rtnetlink.c:3896 [inline] rtnl_newlink+0x122c/0x15c0 net/core/rtnetlink.c:4011 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] __sys_sendto+0x2ec/0x438 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __arm64_sys_sendto+0xe4/0x110 net/socket.c:2200 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Freed by task 10200: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2338 [inline] slab_free mm/slub.c:4598 [inline] kfree+0x140/0x420 mm/slub.c:4746 kvfree+0x4c/0x68 mm/util.c:693 netdev_release+0x94/0xc8 net/core/net-sysfs.c:2034 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 netdev_run_todo+0xdd8/0xf48 net/core/dev.c:10924 rtnl_unlock net/core/rtnetlink.c:152 [inline] rtnl_net_unlock net/core/rtnetlink.c:209 [inline] rtnl_dellink+0x484/0x680 net/core/rtnetlink.c:3526 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] ____sys_sendmsg+0x410/0x708 net/socket.c:2583 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2672 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 The buggy address belongs to the object at ffff0000d768c000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 224 bytes inside of freed 4096-byte region [ffff0000d768c000, ffff0000d768d000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x117688 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff0000c77ef981 flags: 0xbfffe0000000040(head|node=0|zone=2|lastcpupid=0x1ffff) page_type: f5(slab) raw: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 head: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000003 fffffdffc35da201 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000d768bf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff0000d768c000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff0000d768c080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0000d768c100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0000d768c180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8c55facecd7a ("net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down") Reported-by: syzkaller Suggested-by: Jakub Kicinski Link: https://lore.kernel.org/netdev/20250102174400.085fd8ac@kernel.org/ [1] Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250106071911.64355-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/link_watch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b4d39e38084..cb04ef2b9807 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -42,14 +42,18 @@ static unsigned int default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; /* If called from netdev_run_todo()/linkwatch_sync_dev(), * dev_net(dev) can be already freed, and RTNL is not held. */ - if (dev->reg_state == NETREG_UNREGISTERED || - iflink == dev->ifindex) + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; + + if (iflink == dev->ifindex) return IF_OPER_DOWN; ASSERT_RTNL(); -- cgit v1.2.3 From c2994b008492db033d40bd767be1620229a3035e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:09 -0500 Subject: Bluetooth: hci_sync: Fix not setting Random Address when required This fixes errors such as the following when Own address type is set to Random Address but it has not been programmed yet due to either be advertising or connecting: < HCI Command: LE Set Exte.. (0x08|0x0041) plen 13 Own address type: Random (0x03) Filter policy: Ignore not in accept list (0x01) PHYs: 0x05 Entry 0: LE 1M Type: Passive (0x00) Interval: 60.000 msec (0x0060) Window: 30.000 msec (0x0030) Entry 1: LE Coded Type: Passive (0x00) Interval: 180.000 msec (0x0120) Window: 90.000 msec (0x0090) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Exten.. (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Invalid HCI Command Parameters (0x12) Fixes: c45074d68a9b ("Bluetooth: Fix not generating RPA when required") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c86f4e42e69c..7b2b04d6b856 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1031,9 +1031,9 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the + /* If a random_addr has been set we're advertising or initiating an LE + * connection we can't go ahead and change the random address at this + * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). @@ -1041,8 +1041,9 @@ static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { + if (bacmp(&hdev->random_addr, BDADDR_ANY) && + (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; -- cgit v1.2.3 From a182d9c84f9c52fb5db895ecceeee8b3a1bf661e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:10 -0500 Subject: Bluetooth: MGMT: Fix Add Device to responding before completing Add Device with LE type requires updating resolving/accept list which requires quite a number of commands to complete and each of them may fail, so instead of pretending it would always work this checks the return of hci_update_passive_scan_sync which indicates if everything worked as intended. Fixes: e8907f76544f ("Bluetooth: hci_sync: Make use of hci_cmd_sync_queue set 3") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b31192d473d0..de47ad999d7b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7655,6 +7655,24 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } +static void add_device_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_add_device *cp = cmd->param; + + if (!err) { + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, + cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, + cp->addr.type, hdev->conn_flags, + PTR_UINT(cmd->user_data)); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, + mgmt_status(err), &cp->addr, sizeof(cp->addr)); + mgmt_pending_free(cmd); +} + static int add_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); @@ -7663,6 +7681,7 @@ static int add_device_sync(struct hci_dev *hdev, void *data) static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { + struct mgmt_pending_cmd *cmd; struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; struct hci_conn_params *params; @@ -7743,9 +7762,24 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, current_flags = params->flags; } - err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL); - if (err < 0) + cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); + if (!cmd) { + err = -ENOMEM; goto unlock; + } + + cmd->user_data = UINT_PTR(current_flags); + + err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, + add_device_complete); + if (err < 0) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); + mgmt_pending_free(cmd); + } + + goto unlock; added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); -- cgit v1.2.3 From 67dba2c28fe0af7e25ea1aeade677162ed05310a Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 8 Jan 2025 17:50:28 +0800 Subject: Bluetooth: btmtk: Fix failed to send func ctrl for MediaTek devices. Use usb_autopm_get_interface() and usb_autopm_put_interface() in btmtk_usb_shutdown(), it could send func ctrl after enabling autosuspend. Bluetooth: btmtk_usb_hci_wmt_sync() hci0: Execution of wmt command timed out Bluetooth: btmtk_usb_shutdown() hci0: Failed to send wmt func ctrl (-110) Fixes: 5c5e8c52e3ca ("Bluetooth: btmtk: move btusb_mtk_[setup, shutdown] to btmtk.c") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 7 +++++++ net/bluetooth/rfcomm/tty.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 7fd9d5ddce02..224eafc27dbe 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1472,10 +1472,15 @@ EXPORT_SYMBOL_GPL(btmtk_usb_setup); int btmtk_usb_shutdown(struct hci_dev *hdev) { + struct btmtk_data *data = hci_get_priv(hdev); struct btmtk_hci_wmt_params wmt_params; u8 param = 0; int err; + err = usb_autopm_get_interface(data->intf); + if (err < 0) + return err; + /* Disable the device */ wmt_params.op = BTMTK_WMT_FUNC_CTRL; wmt_params.flag = 0; @@ -1486,9 +1491,11 @@ int btmtk_usb_shutdown(struct hci_dev *hdev) err = btmtk_usb_hci_wmt_sync(hdev, &wmt_params); if (err < 0) { bt_dev_err(hdev, "Failed to send wmt func ctrl (%d)", err); + usb_autopm_put_interface(data->intf); return err; } + usb_autopm_put_interface(data->intf); return 0; } EXPORT_SYMBOL_GPL(btmtk_usb_shutdown); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index af80d599c337..21a5b5535ebc 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -201,14 +201,14 @@ static ssize_t address_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%pMR\n", &dev->dst); + return sysfs_emit(buf, "%pMR\n", &dev->dst); } static ssize_t channel_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%d\n", dev->channel); + return sysfs_emit(buf, "%d\n", dev->channel); } static DEVICE_ATTR_RO(address); -- cgit v1.2.3 From d1cacd74776895f6435941f86a1130e58f6dd226 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 10:01:36 -0800 Subject: netdev: prevent accessing NAPI instances from another namespace The NAPI IDs were not fully exposed to user space prior to the netlink API, so they were never namespaced. The netlink API must ensure that at the very least NAPI instance belongs to the same netns as the owner of the genl sock. napi_by_id() can become static now, but it needs to move because of dev_get_by_napi_id(). Cc: stable@vger.kernel.org Fixes: 1287c1ae0fc2 ("netdev-genl: Support setting per-NAPI config values") Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Sridhar Samudrala Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250106180137.1861472-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 43 ++++++++++++++++++++++++++++++------------- net/core/dev.h | 3 ++- net/core/netdev-genl.c | 6 ++---- 3 files changed, 34 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index faa23042df38..a9f62f5aeb84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -753,6 +753,36 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -6293,19 +6323,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) -{ - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; - - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; - - return NULL; -} - static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; diff --git a/net/core/dev.h b/net/core/dev.h index d043dee25a68..deb5eae5749f 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -22,6 +22,8 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id); + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -269,7 +271,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif -struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 125b660004d3..a3bdaf075b6b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -167,8 +167,6 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, void *hdr; pid_t pid; - if (WARN_ON_ONCE(!napi->dev)) - return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; @@ -234,7 +232,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); } else { @@ -355,7 +353,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); } else { -- cgit v1.2.3 From 80fb40baba19e25a1b6f3ecff6fc5c0171806bde Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Jan 2025 11:14:39 +0100 Subject: tcp: Annotate data-race around sk->sk_mark in tcp_v4_send_reset This is a follow-up to 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark"). sk->sk_mark can be read and written without holding the socket lock. IPv6 equivalent is already covered with READ_ONCE() annotation in tcp_v6_send_response(). Fixes: 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark") Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/f459d1fc44f205e13f6d8bdca2c8bfb9902ffac9.1736244569.git.daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a38c8b1f44db..c26f6c4b7bb4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -896,7 +896,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); -- cgit v1.2.3 From 13210fc63f353fe78584048079343413a3cdf819 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Jan 2025 13:01:13 +0100 Subject: netfilter: nf_tables: imbalance in flowtable binding All these cases cause imbalance between BIND and UNBIND calls: - Delete an interface from a flowtable with multiple interfaces - Add a (device to a) flowtable with --check flag - Delete a netns containing a flowtable - In an interactive nft session, create a table with owner flag and flowtable inside, then quit. Fix it by calling FLOW_BLOCK_UNBIND when unregistering hooks, then remove late FLOW_BLOCK_UNBIND call when destroying flowtable. Fixes: ff4bf2f42a40 ("netfilter: nf_tables: add nft_unregister_flowtable_hook()") Reported-by: Phil Sutter Tested-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0b9f1e8dfe49..c4af283356e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8822,6 +8822,7 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { @@ -8829,6 +8830,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); @@ -8837,9 +8840,10 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - __nft_unregister_flowtable_net_hooks(net, hook_list, false); + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -9481,8 +9485,6 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -10870,6 +10872,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); @@ -10878,6 +10881,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11140,11 +11144,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11737,7 +11743,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } -- cgit v1.2.3 From b541ba7d1f5a5b7b3e2e22dc9e40e18a7d6dbc13 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Jan 2025 22:56:33 +0100 Subject: netfilter: conntrack: clamp maximum hashtable size to INT_MAX Use INT_MAX as maximum size for the conntrack hashtable. Otherwise, it is possible to hit WARN_ON_ONCE in __kvmalloc_node_noprof() when resizing hashtable because __GFP_NOWARN is unset. See: 0708a0afe291 ("mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls") Note: hashtable resize is only possible from init_netns. Fixes: 9cc1c73ad666 ("netfilter: conntrack: avoid integer overflow when resizing") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9db3e2b0b1c3..456446d7af20 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2517,12 +2517,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) -- cgit v1.2.3 From 737d4d91d35b5f7fa5bb442651472277318b0bfd Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 7 Jan 2025 13:01:05 +0100 Subject: sched: sch_cake: add bounds checks to host bulk flow fairness counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though we fixed a logic error in the commit cited below, syzbot still managed to trigger an underflow of the per-host bulk flow counters, leading to an out of bounds memory access. To avoid any such logic errors causing out of bounds memory accesses, this commit factors out all accesses to the per-host bulk flow counters to a series of helpers that perform bounds-checking before any increments and decrements. This also has the benefit of improving readability by moving the conditional checks for the flow mode into these helpers, instead of having them spread out throughout the code (which was the cause of the original logic error). As part of this change, the flow quantum calculation is consolidated into a helper function, which means that the dithering applied to the ost load scaling is now applied both in the DRR rotation and when a sparse flow's quantum is first initiated. The only user-visible effect of this is that the maximum packet size that can be sent while a flow stays sparse will now vary with +/- one byte in some cases. This should not make a noticeable difference in practice, and thus it's not worth complicating the code to preserve the old behaviour. Fixes: 546ea84d07e3 ("sched: sch_cake: fix bulk flow accounting logic for host fairness") Reported-by: syzbot+f63600d288bfb7057424@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Dave Taht Link: https://patch.msgid.link/20250107120105.70685-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 140 +++++++++++++++++++++++++++------------------------ 1 file changed, 75 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 8d8b2db4653c..2c2e2a67f3b2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -627,6 +627,63 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { @@ -773,10 +830,8 @@ skip_hash: allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - if (allocate_src) - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - if (allocate_dst) - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ @@ -801,9 +856,10 @@ found: q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -824,9 +880,10 @@ found_src: q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -1839,10 +1896,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1852,18 +1905,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. */ @@ -1871,12 +1914,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; - + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1933,13 +1972,11 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -2039,11 +2076,6 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying @@ -2055,11 +2087,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2071,19 +2100,7 @@ retry: } } - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - WARN_ON(host_load > CAKE_QUEUES); - - /* The get_random_u16() is a way to apply dithering to avoid - * accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - get_random_u16()) >> 16; + flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2107,11 +2124,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2129,12 +2143,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; - + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; -- cgit v1.2.3 From 771ec78dc8b48d562e6015bb535ed3cd37043d78 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:29 +0100 Subject: mptcp: sysctl: avail sched: remove write access 'net.mptcp.available_schedulers' sysctl knob is there to list available schedulers, not to modify this list. There are then no reasons to give write access to it. Nothing would have been written anyway, but no errors would have been returned, which is unexpected. Fixes: 73c900aa3660 ("mptcp: add net.mptcp.available_schedulers") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-1-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 38d8121331d4..d9b57fab2a13 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -228,7 +228,7 @@ static struct ctl_table mptcp_sysctl_table[] = { { .procname = "available_schedulers", .maxlen = MPTCP_SCHED_BUF_MAX, - .mode = 0644, + .mode = 0444, .proc_handler = proc_available_schedulers, }, { -- cgit v1.2.3 From d38e26e36206ae3d544d496513212ae931d1da0a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:30 +0100 Subject: mptcp: sysctl: sched: avoid using current->nsproxy Using the 'net' structure via 'current' is not recommended for different reasons. First, if the goal is to use it to read or write per-netns data, this is inconsistent with how the "generic" sysctl entries are doing: directly by only using pointers set to the table entry, e.g. table->data. Linked to that, the per-netns data should always be obtained from the table linked to the netns it had been created for, which may not coincide with the reader's or writer's netns. Another reason is that access to current->nsproxy->netns can oops if attempted when current->nsproxy had been dropped when the current task is exiting. This is what syzbot found, when using acct(2): Oops: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 1 UID: 0 PID: 5924 Comm: syz-executor Not tainted 6.13.0-rc5-syzkaller-00004-gccb98ccef0e5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_sys_call_handler+0x403/0x5d0 fs/proc/proc_sysctl.c:601 __kernel_write_iter+0x318/0xa80 fs/read_write.c:612 __kernel_write+0xf6/0x140 fs/read_write.c:632 do_acct_process+0xcb0/0x14a0 kernel/acct.c:539 acct_pin_kill+0x2d/0x100 kernel/acct.c:192 pin_kill+0x194/0x7c0 fs/fs_pin.c:44 mnt_pin_kill+0x61/0x1e0 fs/fs_pin.c:81 cleanup_mnt+0x3ac/0x450 fs/namespace.c:1366 task_work_run+0x14e/0x250 kernel/task_work.c:239 exit_task_work include/linux/task_work.h:43 [inline] do_exit+0xad8/0x2d70 kernel/exit.c:938 do_group_exit+0xd3/0x2a0 kernel/exit.c:1087 get_signal+0x2576/0x2610 kernel/signal.c:3017 arch_do_signal_or_restart+0x90/0x7e0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xda/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fee3cb87a6a Code: Unable to access opcode bytes at 0x7fee3cb87a40. RSP: 002b:00007fffcccac688 EFLAGS: 00000202 ORIG_RAX: 0000000000000037 RAX: 0000000000000000 RBX: 00007fffcccac710 RCX: 00007fee3cb87a6a RDX: 0000000000000041 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 00007fffcccac6ac R09: 00007fffcccacac7 R10: 00007fffcccac710 R11: 0000000000000202 R12: 00007fee3cd49500 R13: 00007fffcccac6ac R14: 0000000000000000 R15: 00007fee3cd4b000 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 ---------------- Code disassembly (best guess), 1 bytes skipped: 0: 42 80 3c 38 00 cmpb $0x0,(%rax,%r15,1) 5: 0f 85 fe 02 00 00 jne 0x309 b: 4d 8b a4 24 08 09 00 mov 0x908(%r12),%r12 12: 00 13: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 1a: fc ff df 1d: 49 8d 7c 24 28 lea 0x28(%r12),%rdi 22: 48 89 fa mov %rdi,%rdx 25: 48 c1 ea 03 shr $0x3,%rdx * 29: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) <-- trapping instruction 2d: 0f 85 cc 02 00 00 jne 0x2ff 33: 4d 8b 7c 24 28 mov 0x28(%r12),%r15 38: 48 rex.W 39: 8d .byte 0x8d 3a: 84 24 c8 test %ah,(%rax,%rcx,8) Here with 'net.mptcp.scheduler', the 'net' structure is not really needed, because the table->data already has a pointer to the current scheduler, the only thing needed from the per-netns data. Simply use 'data', instead of getting (most of the time) the same thing, but from a longer and indirect way. Fixes: 6963c508fd7a ("mptcp: only allow set existing scheduler for net.mptcp.scheduler") Cc: stable@vger.kernel.org Reported-by: syzbot+e364f774c6f57f2c86d1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-2-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index d9b57fab2a13..81c30aa02196 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -102,16 +102,15 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) } #ifdef CONFIG_SYSCTL -static int mptcp_set_scheduler(const struct net *net, const char *name) +static int mptcp_set_scheduler(char *scheduler, const char *name) { - struct mptcp_pernet *pernet = mptcp_get_pernet(net); struct mptcp_sched_ops *sched; int ret = 0; rcu_read_lock(); sched = mptcp_sched_find(name); if (sched) - strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX); + strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); else ret = -ENOENT; rcu_read_unlock(); @@ -122,7 +121,7 @@ static int mptcp_set_scheduler(const struct net *net, const char *name) static int proc_scheduler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - const struct net *net = current->nsproxy->net_ns; + char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; char val[MPTCP_SCHED_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -130,11 +129,11 @@ static int proc_scheduler(const struct ctl_table *ctl, int write, }; int ret; - strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX); + strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = mptcp_set_scheduler(net, val); + ret = mptcp_set_scheduler(*scheduler, val); return ret; } -- cgit v1.2.3 From 92cf7a51bdae24a32c592adcdd59a773ae149289 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:31 +0100 Subject: mptcp: sysctl: blackhole timeout: avoid using current->nsproxy As mentioned in the previous commit, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'pernet' structure can be obtained from the table->data using container_of(). Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-3-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 81c30aa02196..b0dd008e2114 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -160,7 +160,9 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns); + struct mptcp_pernet *pernet = container_of(table->data, + struct mptcp_pernet, + blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -- cgit v1.2.3 From ea62dd1383913b5999f3d16ae99d411f41b528d4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:32 +0100 Subject: sctp: sysctl: cookie_hmac_alg: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.sctp_hmac_alg' is used. Fixes: 3c68198e7511 ("sctp: Make hmac algorithm selection for cookie generation dynamic") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-4-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index e5a5af343c4c..9848d19630a4 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table sctp_net_table[] = { static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.sctp_hmac_alg); struct ctl_table tbl; bool changed = false; char *none = "none"; -- cgit v1.2.3 From 9fc17b76fc70763780aa78b38fcf4742384044a5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:33 +0100 Subject: sctp: sysctl: rto_min/max: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.rto_min/max' is used. Fixes: 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-5-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9848d19630a4..a5285815264d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -433,7 +433,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_min); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; @@ -461,7 +461,7 @@ static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_max); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; -- cgit v1.2.3 From 15649fd5415eda664ef35780c2013adeb5d9c695 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:34 +0100 Subject: sctp: sysctl: auth_enable: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: b14878ccb7fa ("net: sctp: cache auth_enable per endpoint") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-6-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index a5285815264d..9d29611621fe 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -499,7 +499,7 @@ static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.auth_enable); struct ctl_table tbl; int new_value, ret; -- cgit v1.2.3 From c10377bbc1972d858eaf0ab366a311b39f8ef1b6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:35 +0100 Subject: sctp: sysctl: udp_port: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: 046c052b475e ("sctp: enable udp tunneling socks") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-7-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9d29611621fe..18fa4f44e8ec 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -528,7 +528,7 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.udp_port); unsigned int min = *(unsigned int *)ctl->extra1; unsigned int max = *(unsigned int *)ctl->extra2; struct ctl_table tbl; -- cgit v1.2.3 From 6259d2484d0ceff42245d1f09cc8cb6ee72d847a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:36 +0100 Subject: sctp: sysctl: plpmtud_probe_interval: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.probe_interval' is used. Fixes: d1e462a7a5f3 ("sctp: add probe_interval in sysctl and sock/asoc/transport") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-8-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 18fa4f44e8ec..8e1e97be4df7 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -569,7 +569,8 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.probe_interval); struct ctl_table tbl; int ret, new_value; -- cgit v1.2.3 From 7f5611cbc4871c7fb1ad36c2e5a9edad63dca95c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:37 +0100 Subject: rds: sysctl: rds_tcp_{rcv,snd}buf: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The per-netns structure can be obtained from the table->data using container_of(), then the 'net' one can be retrieved from the listen socket (if available). Fixes: c6a58ffed536 ("RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-9-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/rds/tcp.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351ac1747224..0581c53e6517 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -61,8 +61,10 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, - void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -74,7 +76,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_sndbuf_handler, .extra1 = &rds_tcp_min_sndbuf, }, #define RDS_TCP_RCVBUF 1 @@ -83,7 +85,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_rcvbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, }; @@ -682,10 +684,10 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_unlock_irq(&rds_tcp_conn_lock); } -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, +static int rds_tcp_skbuf_handler(struct rds_tcp_net *rtn, + const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *fpos) { - struct net *net = current->nsproxy->net_ns; int err; err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); @@ -694,11 +696,34 @@ static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, *(int *)(ctl->extra1)); return err; } - if (write) + + if (write && rtn->rds_tcp_listen_sock && rtn->rds_tcp_listen_sock->sk) { + struct net *net = sock_net(rtn->rds_tcp_listen_sock->sk); + rds_tcp_sysctl_reset(net); + } + return 0; } +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + sndbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + rcvbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + static void rds_tcp_exit(void) { rds_tcp_set_unloading(); -- cgit v1.2.3 From 8c7a6efc017e59f2b773a8a4c0897309dfe1d742 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 8 Jan 2025 17:57:15 +0100 Subject: ipv4: route: fix drop reason being overridden in ip_route_input_slow When jumping to 'martian_destination' a drop reason is always set but that label falls-through the 'e_nobufs' one, overriding the value. The behavior was introduced by the mentioned commit. The logic went from, goto martian_destination; ... martian_destination: ... e_inval: err = -EINVAL; goto out; e_nobufs: err = -ENOBUFS; goto out; to, reason = ...; goto martian_destination; ... martian_destination: ... e_nobufs: reason = SKB_DROP_REASON_NOMEM; goto out; A 'goto out' is clearly missing now after 'martian_destination' to avoid overriding the drop reason. Fixes: 5b92112acd8e ("net: ip: make ip_route_input_slow() return drop reasons") Reported-by: Sabrina Dubroca Cc: Menglong Dong Signed-off-by: Antoine Tenart Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250108165725.404564-1-atenart@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0fbec3509618..e1564b95fab0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2445,6 +2445,7 @@ martian_destination: net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif + goto out; e_nobufs: reason = SKB_DROP_REASON_NOMEM; -- cgit v1.2.3 From b3af60928ab9129befa65e6df0310d27300942bf Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Fri, 10 Jan 2025 14:21:55 +0100 Subject: bpf: Fix bpf_sk_select_reuseport() memory leak As pointed out in the original comment, lookup in sockmap can return a TCP ESTABLISHED socket. Such TCP socket may have had SO_ATTACH_REUSEPORT_EBPF set before it was ESTABLISHED. In other words, a non-NULL sk_reuseport_cb does not imply a non-refcounted socket. Drop sk's reference in both error paths. unreferenced object 0xffff888101911800 (size 2048): comm "test_progs", pid 44109, jiffies 4297131437 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 80 00 01 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 9336483b): __kmalloc_noprof+0x3bf/0x560 __reuseport_alloc+0x1d/0x40 reuseport_alloc+0xca/0x150 reuseport_attach_prog+0x87/0x140 sk_reuseport_attach_bpf+0xc8/0x100 sk_setsockopt+0x1181/0x1990 do_sock_setsockopt+0x12b/0x160 __sys_setsockopt+0x7b/0xc0 __x64_sys_setsockopt+0x1b/0x30 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 64d85290d79c ("bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH") Signed-off-by: Michal Luczaj Reviewed-by: Martin KaFai Lau Link: https://patch.msgid.link/20250110-reuseport-memleak-v1-1-fa1ddab0adfe@rbox.co Signed-off-by: Jakub Kicinski --- net/core/filter.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 834614071727..2fb45a86f3dd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11251,6 +11251,7 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY; struct sock_reuseport *reuse; struct sock *selected_sk; + int err; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) @@ -11258,10 +11259,6 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { - /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ - if (sk_is_refcounted(selected_sk)) - sock_put(selected_sk); - /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. @@ -11269,24 +11266,33 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, * Other maps (e.g. sock_map) do not provide this guarantee and * the sk may never be in the reuseport group to begin with. */ - return is_sockarray ? -ENOENT : -EINVAL; + err = is_sockarray ? -ENOENT : -EINVAL; + goto error; } if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk = reuse_kern->sk; - if (sk->sk_protocol != selected_sk->sk_protocol) - return -EPROTOTYPE; - else if (sk->sk_family != selected_sk->sk_family) - return -EAFNOSUPPORT; - - /* Catch all. Likely bound to a different sockaddr. */ - return -EBADFD; + if (sk->sk_protocol != selected_sk->sk_protocol) { + err = -EPROTOTYPE; + } else if (sk->sk_family != selected_sk->sk_family) { + err = -EAFNOSUPPORT; + } else { + /* Catch all. Likely bound to a different sockaddr. */ + err = -EBADFD; + } + goto error; } reuse_kern->selected_sk = selected_sk; return 0; +error: + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + + return err; } static const struct bpf_func_proto sk_select_reuseport_proto = { -- cgit v1.2.3 From 5ef44b3cb43bda4009ba7fe3d54e0d258ae4aee7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 8 Jan 2025 16:34:36 -0800 Subject: xsk: Bring back busy polling support Commit 86e25f40aa1e ("net: napi: Add napi_config") moved napi->napi_id assignment to a later point in time (napi_hash_add_with_id). This breaks __xdp_rxq_info_reg which copies napi_id at an earlier time and now stores 0 napi_id. It also makes sk_mark_napi_id_once_xdp and __sk_mark_napi_id_once useless because they now work against 0 napi_id. Since sk_busy_loop requires valid napi_id to busy-poll on, there is no way to busy-poll AF_XDP sockets anymore. Bring back the ability to busy-poll on XSK by resolving socket's napi_id at bind time. This relies on relatively recent netif_queue_set_napi, but (assume) at this point most popular drivers should have been converted. This also removes per-tx/rx cycles which used to check and/or set the napi_id value. Confirmed by running a busy-polling AF_XDP socket (github.com/fomichev/xskrtt) on mlx5 and looking at BusyPollRxPackets from /proc/net/netstat. Fixes: 86e25f40aa1e ("net: napi: Add napi_config") Signed-off-by: Stanislav Fomichev Acked-by: Magnus Karlsson Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250109003436.2829560-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 8 -------- include/net/xdp.h | 1 - include/net/xdp_sock_drv.h | 14 -------------- net/core/xdp.c | 1 - net/xdp/xsk.c | 14 +++++++++----- 5 files changed, 9 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c858270141bc..c39a426ebf52 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -174,12 +174,4 @@ static inline void sk_mark_napi_id_once(struct sock *sk, #endif } -static inline void sk_mark_napi_id_once_xdp(struct sock *sk, - const struct xdp_buff *xdp) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); -#endif -} - #endif /* _LINUX_NET_BUSY_POLL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c91..b5b10f2b88e5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -62,7 +62,6 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; - unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd9160..7a7316d9c0da 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -59,15 +59,6 @@ static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, xp_fill_cb(pool, desc); } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - return pool->heads[0].xdp.rxq->napi_id; -#else - return 0; -#endif -} - static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { @@ -306,11 +297,6 @@ static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, { } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) -{ - return 0; -} - static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { diff --git a/net/core/xdp.c b/net/core/xdp.c index bcc5551c6424..2315feed94ef 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -186,7 +186,6 @@ int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, xdp_rxq_info_init(xdp_rxq); xdp_rxq->dev = dev; xdp_rxq->queue_index = queue_index; - xdp_rxq->napi_id = napi_id; xdp_rxq->frag_size = frag_size; xdp_rxq->reg_state = REG_STATE_REGISTERED; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 3fa70286c846..89d2bef96469 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -322,7 +322,6 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return -ENOSPC; } - sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -908,11 +907,8 @@ static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len if (unlikely(!xs->tx)) return -ENOBUFS; - if (sk_can_busy_loop(sk)) { - if (xs->zc) - __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool)); + if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ - } if (xs->zc && xsk_no_wakeup(sk)) return 0; @@ -1298,6 +1294,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = qid; xp_add_xsk(xs->pool, xs); + if (xs->zc && qid < dev->real_num_rx_queues) { + struct netdev_rx_queue *rxq; + + rxq = __netif_get_rx_queue(dev, qid); + if (rxq->napi) + __sk_mark_napi_id_once(sk, rxq->napi->napi_id); + } + out_unlock: if (err) { dev_put(dev); -- cgit v1.2.3 From 47e55e4b410f7d552e43011baa5be1aab4093990 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 9 Jan 2025 13:21:24 +0100 Subject: openvswitch: fix lockup on tx to unregistering netdev with carrier Commit in a fixes tag attempted to fix the issue in the following sequence of calls: do_output -> ovs_vport_send -> dev_queue_xmit -> __dev_queue_xmit -> netdev_core_pick_tx -> skb_tx_hash When device is unregistering, the 'dev->real_num_tx_queues' goes to zero and the 'while (unlikely(hash >= qcount))' loop inside the 'skb_tx_hash' becomes infinite, locking up the core forever. But unfortunately, checking just the carrier status is not enough to fix the issue, because some devices may still be in unregistering state while reporting carrier status OK. One example of such device is a net/dummy. It sets carrier ON on start, but it doesn't implement .ndo_stop to set the carrier off. And it makes sense, because dummy doesn't really have a carrier. Therefore, while this device is unregistering, it's still easy to hit the infinite loop in the skb_tx_hash() from the OVS datapath. There might be other drivers that do the same, but dummy by itself is important for the OVS ecosystem, because it is frequently used as a packet sink for tcpdump while debugging OVS deployments. And when the issue is hit, the only way to recover is to reboot. Fix that by also checking if the device is running. The running state is handled by the net core during unregistering, so it covers unregistering case better, and we don't really need to send packets to devices that are not running anyway. While only checking the running state might be enough, the carrier check is preserved. The running and the carrier states seem disjoined throughout the code and different drivers. And other core functions like __dev_direct_xmit() check both before attempting to transmit a packet. So, it seems safer to check both flags in OVS as well. Fixes: 066b86787fa3 ("net: openvswitch: fix race on port output") Reported-by: Friedrich Weber Closes: https://mail.openvswitch.org/pipermail/ovs-discuss/2025-January/053423.html Signed-off-by: Ilya Maximets Tested-by: Friedrich Weber Reviewed-by: Aaron Conole Link: https://patch.msgid.link/20250109122225.4034688-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- net/openvswitch/actions.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 16e260014684..704c858cf209 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -934,7 +934,9 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport && netif_carrier_ok(vport->dev))) { + if (likely(vport && + netif_running(vport->dev) && + netif_carrier_ok(vport->dev))) { u16 mru = OVS_CB(skb)->mru; u32 cutlen = OVS_CB(skb)->cutlen; -- cgit v1.2.3 From 76201b5979768500bca362871db66d77cb4c225e Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Thu, 9 Jan 2025 11:30:39 +0300 Subject: pktgen: Avoid out-of-bounds access in get_imix_entries Passing a sufficient amount of imix entries leads to invalid access to the pkt_dev->imix_entries array because of the incorrect boundary check. UBSAN: array-index-out-of-bounds in net/core/pktgen.c:874:24 index 20 is out of range for type 'imix_pkt [20]' CPU: 2 PID: 1210 Comm: bash Not tainted 6.10.0-rc1 #121 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) Call Trace: dump_stack_lvl lib/dump_stack.c:117 __ubsan_handle_out_of_bounds lib/ubsan.c:429 get_imix_entries net/core/pktgen.c:874 pktgen_if_write net/core/pktgen.c:1063 pde_write fs/proc/inode.c:334 proc_reg_write fs/proc/inode.c:346 vfs_write fs/read_write.c:593 ksys_write fs/read_write.c:644 do_syscall_64 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe arch/x86/entry/entry_64.S:130 Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 52a62f8603f9 ("pktgen: Parse internet mix (imix) input") Signed-off-by: Artem Chernyshev [ fp: allow to fill the array completely; minor changelog cleanup ] Signed-off-by: Fedor Pchelkin Signed-off-by: David S. Miller --- net/core/pktgen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e23cacbe66e..4cb547fae91f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -851,6 +851,9 @@ static ssize_t get_imix_entries(const char __user *buffer, unsigned long weight; unsigned long size; + if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) + return -E2BIG; + len = num_arg(&buffer[i], max_digits, &size); if (len < 0) return len; @@ -880,9 +883,6 @@ static ssize_t get_imix_entries(const char __user *buffer, i++; pkt_dev->n_imix_entries++; - - if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) - return -E2BIG; } while (c == ' '); return i; -- cgit v1.2.3 From 9e2bbab94b88295dcc57c7580393c9ee08d7314d Mon Sep 17 00:00:00 2001 From: Paul Fertser Date: Thu, 9 Jan 2025 17:50:54 +0300 Subject: net/ncsi: fix locking in Get MAC Address handling Obtaining RTNL lock in a response handler is not allowed since it runs in an atomic softirq context. Postpone setting the MAC address by adding a dedicated step to the configuration FSM. Fixes: 790071347a0a ("net/ncsi: change from ndo_set_mac_address to dev_set_mac_address") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20241129-potin-revert-ncsi-set-mac-addr-v1-1-94ea2cb596af@gmail.com Signed-off-by: Paul Fertser Tested-by: Potin Lai Link: https://patch.msgid.link/20250109145054.30925-1-fercerpav@gmail.com Signed-off-by: Jakub Kicinski --- net/ncsi/internal.h | 2 ++ net/ncsi/ncsi-manage.c | 16 ++++++++++++++-- net/ncsi/ncsi-rsp.c | 19 ++++++------------- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index ef0f8f73826f..4e0842df5234 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -289,6 +289,7 @@ enum { ncsi_dev_state_config_sp = 0x0301, ncsi_dev_state_config_cis, ncsi_dev_state_config_oem_gma, + ncsi_dev_state_config_apply_mac, ncsi_dev_state_config_clear_vids, ncsi_dev_state_config_svf, ncsi_dev_state_config_ev, @@ -322,6 +323,7 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESHUFFLE 4 #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ + struct sockaddr pending_mac; /* MAC address received from GMA */ spinlock_t lock; /* Protect the NCSI device */ unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 5cf55bde366d..bf276eaf9330 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1038,7 +1038,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) : ncsi_dev_state_config_clear_vids; break; case ncsi_dev_state_config_oem_gma: - nd->state = ncsi_dev_state_config_clear_vids; + nd->state = ncsi_dev_state_config_apply_mac; nca.package = np->id; nca.channel = nc->id; @@ -1050,10 +1050,22 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) nca.type = NCSI_PKT_CMD_OEM; ret = ncsi_gma_handler(&nca, nc->version.mf_id); } - if (ret < 0) + if (ret < 0) { + nd->state = ncsi_dev_state_config_clear_vids; schedule_work(&ndp->work); + } break; + case ncsi_dev_state_config_apply_mac: + rtnl_lock(); + ret = dev_set_mac_address(dev, &ndp->pending_mac, NULL); + rtnl_unlock(); + if (ret < 0) + netdev_warn(dev, "NCSI: 'Writing MAC address to device failed\n"); + + nd->state = ncsi_dev_state_config_clear_vids; + + fallthrough; case ncsi_dev_state_config_clear_vids: case ncsi_dev_state_config_svf: case ncsi_dev_state_config_ev: diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index e28be33bdf2c..14bd66909ca4 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -628,16 +628,14 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) { struct ncsi_dev_priv *ndp = nr->ndp; + struct sockaddr *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_oem_pkt *rsp; - struct sockaddr saddr; u32 mac_addr_off = 0; - int ret = 0; /* Get the response header */ rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); - saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; if (mfr_id == NCSI_OEM_MFR_BCM_ID) mac_addr_off = BCM_MAC_ADDR_OFFSET; @@ -646,22 +644,17 @@ static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) else if (mfr_id == NCSI_OEM_MFR_INTEL_ID) mac_addr_off = INTEL_MAC_ADDR_OFFSET; - memcpy(saddr.sa_data, &rsp->data[mac_addr_off], ETH_ALEN); + saddr->sa_family = ndev->type; + memcpy(saddr->sa_data, &rsp->data[mac_addr_off], ETH_ALEN); if (mfr_id == NCSI_OEM_MFR_BCM_ID || mfr_id == NCSI_OEM_MFR_INTEL_ID) - eth_addr_inc((u8 *)saddr.sa_data); - if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + eth_addr_inc((u8 *)saddr->sa_data); + if (!is_valid_ether_addr((const u8 *)saddr->sa_data)) return -ENXIO; /* Set the flag for GMA command which should only be called once */ ndp->gma_flag = 1; - rtnl_lock(); - ret = dev_set_mac_address(ndev, &saddr, NULL); - rtnl_unlock(); - if (ret < 0) - netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n"); - - return ret; + return 0; } /* Response handler for Mellanox card */ -- cgit v1.2.3 From 644f9108f3a505022ef43510e5143cb985e0cf8b Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Fri, 10 Jan 2025 09:08:10 +0800 Subject: udp: Make rehash4 independent in udp_lib_rehash() As discussed in [0], rehash4 could be missed in udp_lib_rehash() when udp hash4 changes while hash2 doesn't change. This patch fixes this by moving rehash4 codes out of rehash2 checking, and then rehash2 and rehash4 are done separately. By doing this, we no longer need to call rehash4 explicitly in udp_lib_hash4(), as the rehash callback in __ip4_datagram_connect takes it. Thus, now udp_lib_hash4() returns directly if the sk is already hashed. Note that uhash4 may fail to work under consecutive connect() calls because rehash() is not called with every connect(). To overcome this, connect() needs to be called after the next connect to a new destination. [0] https://lore.kernel.org/all/4761e466ab9f7542c68cdc95f248987d127044d2.1733499715.git.pabeni@redhat.com/ Fixes: 78c91ae2c6de ("ipv4/udp: Add 4-tuple hash for connected socket") Suggested-by: Paolo Abeni Signed-off-by: Philo Lu Link: https://patch.msgid.link/20250110010810.107145-1-lulie@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/ipv4/udp.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e8953e88efef..86d282618515 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -533,7 +533,7 @@ begin: return NULL; } -/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */ +/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { @@ -582,15 +582,13 @@ void udp_lib_hash4(struct sock *sk, u16 hash) struct net *net = sock_net(sk); struct udp_table *udptable; - /* Connected udp socket can re-connect to another remote address, - * so rehash4 is needed. + /* Connected udp socket can re-connect to another remote address, which + * will be handled by rehash. Thus no need to redo hash4 here. */ - udptable = net->ipv4.udp_table; - if (udp_hashed4(sk)) { - udp_rehash4(udptable, sk, hash); + if (udp_hashed4(sk)) return; - } + udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, hash); @@ -2173,14 +2171,14 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { - hslot = udp_hashslot(udptable, sock_net(sk), - udp_sk(sk)->udp_port_hash); /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) @@ -2199,19 +2197,29 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) spin_unlock(&nhslot2->lock); } - if (udp_hashed4(sk)) { - udp_rehash4(udptable, sk, newhash4); + spin_unlock_bh(&hslot->lock); + } + + /* Now process hash4 if necessary: + * (1) update hslot4; + * (2) update hslot2->hash4_cnt. + * Note that hslot2/hslot4 should be checked separately, as + * either of them may change with the other unchanged. + */ + if (udp_hashed4(sk)) { + spin_lock_bh(&hslot->lock); - if (hslot2 != nhslot2) { - spin_lock(&hslot2->lock); - udp_hash4_dec(hslot2); - spin_unlock(&hslot2->lock); + udp_rehash4(udptable, sk, newhash4); + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); - spin_lock(&nhslot2->lock); - udp_hash4_inc(nhslot2); - spin_unlock(&nhslot2->lock); - } + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); } + spin_unlock_bh(&hslot->lock); } } -- cgit v1.2.3 From 2cb7c756f605ec02ffe562fb26828e4bcc5fdfc1 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:07 +0100 Subject: vsock/virtio: discard packets if the transport changes If the socket has been de-assigned or assigned to another transport, we must discard any packets received because they are not expected and would cause issues when we access vsk->transport. A possible scenario is described by Hyunwoo Kim in the attached link, where after a first connect() interrupted by a signal, and a second connect() failed, we can find `vsk->transport` at NULL, leading to a NULL pointer dereference. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Reported-by: Hyunwoo Kim Reported-by: Wongi Lee Closes: https://lore.kernel.org/netdev/Z2LvdTTQR7dBmPb5@v4bel-B760M-AORUS-ELITE-AX/ Signed-off-by: Stefano Garzarella Reviewed-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 9acc13ab3f82..51a494b69be8 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1628,8 +1628,11 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, lock_sock(sk); - /* Check if sk has been closed before lock_sock */ - if (sock_flag(sk, SOCK_DONE)) { + /* Check if sk has been closed or assigned to another transport before + * lock_sock (note: listener sockets are not assigned to any transport) + */ + if (sock_flag(sk, SOCK_DONE) || + (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) { (void)virtio_transport_reset_no_sock(t, skb); release_sock(sk); sock_put(sk); -- cgit v1.2.3 From f6abafcd32f9cfc4b1a2f820ecea70773e26d423 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:08 +0100 Subject: vsock/bpf: return early if transport is not assigned Some of the core functions can only be called if the transport has been assigned. As Michal reported, a socket might have the transport at NULL, for example after a failed connect(), causing the following trace: BUG: kernel NULL pointer dereference, address: 00000000000000a0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 12faf8067 P4D 12faf8067 PUD 113670067 PMD 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 15 UID: 0 PID: 1198 Comm: a.out Not tainted 6.13.0-rc2+ RIP: 0010:vsock_connectible_has_data+0x1f/0x40 Call Trace: vsock_bpf_recvmsg+0xca/0x5e0 sock_recvmsg+0xb9/0xc0 __sys_recvfrom+0xb3/0x130 __x64_sys_recvfrom+0x20/0x30 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e So we need to check the `vsk->transport` in vsock_bpf_recvmsg(), especially for connected sockets (stream/seqpacket) as we already do in __vsock_connectible_recvmsg(). Fixes: 634f1a7110b4 ("vsock: support sockmap") Cc: stable@vger.kernel.org Reported-by: Michal Luczaj Closes: https://lore.kernel.org/netdev/5ca20d4c-1017-49c2-9516-f6f75fd331e9@rbox.co/ Tested-by: Michal Luczaj Reported-by: syzbot+3affdbfc986ecd9200fd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/677f84a8.050a0220.25a300.01b3.GAE@google.com/ Tested-by: syzbot+3affdbfc986ecd9200fd@syzkaller.appspotmail.com Reviewed-by: Hyunwoo Kim Acked-by: Michael S. Tsirkin Reviewed-by: Luigi Leonardi Signed-off-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- net/vmw_vsock/vsock_bpf.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index 4aa6e74ec295..f201d9eca1df 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -77,6 +77,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; + struct vsock_sock *vsk; int copied; psock = sk_psock_get(sk); @@ -84,6 +85,13 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, return __vsock_recvmsg(sk, msg, len, flags); lock_sock(sk); + vsk = vsock_sk(sk); + + if (!vsk->transport) { + copied = -ENODEV; + goto out; + } + if (vsock_has_data(sk, psock) && sk_psock_queue_empty(psock)) { release_sock(sk); sk_psock_put(sk, psock); @@ -108,6 +116,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, copied = sk_msg_recvmsg(sk, psock, msg, len, flags); } +out: release_sock(sk); sk_psock_put(sk, psock); -- cgit v1.2.3 From df137da9d6d166e87e40980e36eb8e0bc90483ef Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:09 +0100 Subject: vsock/virtio: cancel close work in the destructor During virtio_transport_release() we can schedule a delayed work to perform the closing of the socket before destruction. The destructor is called either when the socket is really destroyed (reference counter to zero), or it can also be called when we are de-assigning the transport. In the former case, we are sure the delayed work has completed, because it holds a reference until it completes, so the destructor will definitely be called after the delayed work is finished. But in the latter case, the destructor is called by AF_VSOCK core, just after the release(), so there may still be delayed work scheduled. Refactor the code, moving the code to delete the close work already in the do_close() to a new function. Invoke it during destruction to make sure we don't leave any pending work. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Reported-by: Hyunwoo Kim Closes: https://lore.kernel.org/netdev/Z37Sh+utS+iV3+eb@v4bel-B760M-AORUS-ELITE-AX/ Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Tested-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 51a494b69be8..7f7de6d88096 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -26,6 +26,9 @@ /* Threshold for detecting small packets to copy */ #define GOOD_COPY_LEN 128 +static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, + bool cancel_timeout); + static const struct virtio_transport * virtio_transport_get_ops(struct vsock_sock *vsk) { @@ -1109,6 +1112,8 @@ void virtio_transport_destruct(struct vsock_sock *vsk) { struct virtio_vsock_sock *vvs = vsk->trans; + virtio_transport_cancel_close_work(vsk, true); + kfree(vvs); vsk->trans = NULL; } @@ -1204,17 +1209,11 @@ static void virtio_transport_wait_close(struct sock *sk, long timeout) } } -static void virtio_transport_do_close(struct vsock_sock *vsk, - bool cancel_timeout) +static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, + bool cancel_timeout) { struct sock *sk = sk_vsock(vsk); - sock_set_flag(sk, SOCK_DONE); - vsk->peer_shutdown = SHUTDOWN_MASK; - if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = TCP_CLOSING; - sk->sk_state_change(sk); - if (vsk->close_work_scheduled && (!cancel_timeout || cancel_delayed_work(&vsk->close_work))) { vsk->close_work_scheduled = false; @@ -1226,6 +1225,20 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, } } +static void virtio_transport_do_close(struct vsock_sock *vsk, + bool cancel_timeout) +{ + struct sock *sk = sk_vsock(vsk); + + sock_set_flag(sk, SOCK_DONE); + vsk->peer_shutdown = SHUTDOWN_MASK; + if (vsock_stream_has_data(vsk) <= 0) + sk->sk_state = TCP_CLOSING; + sk->sk_state_change(sk); + + virtio_transport_cancel_close_work(vsk, cancel_timeout); +} + static void virtio_transport_close_timeout(struct work_struct *work) { struct vsock_sock *vsk = -- cgit v1.2.3 From a24009bc9be60242651a21702609381b5092459e Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:10 +0100 Subject: vsock: reset socket state when de-assigning the transport Transport's release() and destruct() are called when de-assigning the vsock transport. These callbacks can touch some socket state like sock flags, sk_state, and peer_shutdown. Since we are reassigning the socket to a new transport during vsock_connect(), let's reset these fields to have a clean state with the new transport. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 5cf8109f672a..74d35a871644 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -491,6 +491,15 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); + + /* transport's release() and destruct() can touch some socket + * state, since we are reassigning the socket to a new transport + * during vsock_connect(), let's reset these fields to have a + * clean state. + */ + sock_reset_flag(sk, SOCK_DONE); + sk->sk_state = TCP_CLOSE; + vsk->peer_shutdown = 0; } /* We increase the module refcnt to prevent the transport unloading -- cgit v1.2.3 From 91751e248256efc111e52e15115840c35d85abaf Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 10 Jan 2025 09:35:11 +0100 Subject: vsock: prevent null-ptr-deref in vsock_*[has_data|has_space] Recent reports have shown how we sometimes call vsock_*_has_data() when a vsock socket has been de-assigned from a transport (see attached links), but we shouldn't. Previous commits should have solved the real problems, but we may have more in the future, so to avoid null-ptr-deref, we can return 0 (no space, no data available) but with a warning. This way the code should continue to run in a nearly consistent state and have a warning that allows us to debug future problems. Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/netdev/Z2K%2FI4nlHdfMRTZC@v4bel-B760M-AORUS-ELITE-AX/ Link: https://lore.kernel.org/netdev/5ca20d4c-1017-49c2-9516-f6f75fd331e9@rbox.co/ Link: https://lore.kernel.org/netdev/677f84a8.050a0220.25a300.01b3.GAE@google.com/ Co-developed-by: Hyunwoo Kim Signed-off-by: Hyunwoo Kim Co-developed-by: Wongi Lee Signed-off-by: Wongi Lee Signed-off-by: Stefano Garzarella Reviewed-by: Luigi Leonardi Reviewed-by: Hyunwoo Kim Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 74d35a871644..fa9d1b49599b 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -879,6 +879,9 @@ EXPORT_SYMBOL_GPL(vsock_create_connected); s64 vsock_stream_has_data(struct vsock_sock *vsk) { + if (WARN_ON(!vsk->transport)) + return 0; + return vsk->transport->stream_has_data(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_data); @@ -887,6 +890,9 @@ s64 vsock_connectible_has_data(struct vsock_sock *vsk) { struct sock *sk = sk_vsock(vsk); + if (WARN_ON(!vsk->transport)) + return 0; + if (sk->sk_type == SOCK_SEQPACKET) return vsk->transport->seqpacket_has_data(vsk); else @@ -896,6 +902,9 @@ EXPORT_SYMBOL_GPL(vsock_connectible_has_data); s64 vsock_stream_has_space(struct vsock_sock *vsk) { + if (WARN_ON(!vsk->transport)) + return 0; + return vsk->transport->stream_has_space(vsk); } EXPORT_SYMBOL_GPL(vsock_stream_has_space); -- cgit v1.2.3 From 2ca06a2f65310aeef30bb69b7405437a14766e4d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 13 Jan 2025 16:44:56 +0100 Subject: mptcp: be sure to send ack when mptcp-level window re-opens mptcp_cleanup_rbuf() is responsible to send acks when the user-space reads enough data to update the receive windows significantly. It tries hard to avoid acquiring the subflow sockets locks by checking conditions similar to the ones implemented at the TCP level. To avoid too much code duplication - the MPTCP protocol can't reuse the TCP helpers as part of the relevant status is maintained into the msk socket - and multiple costly window size computation, mptcp_cleanup_rbuf uses a rough estimate for the most recently advertised window size: the MPTCP receive free space, as recorded as at last-ack time. Unfortunately the above does not allow mptcp_cleanup_rbuf() to detect a zero to non-zero win change in some corner cases, skipping the tcp_cleanup_rbuf call and leaving the peer stuck. After commit ea66758c1795 ("tcp: allow MPTCP to update the announced window"), MPTCP has actually cheap access to the announced window value. Use it in mptcp_cleanup_rbuf() for a more accurate ack generation. Fixes: e3859603ba13 ("mptcp: better msk receive window updates") Cc: stable@vger.kernel.org Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/20250107131845.5e5de3c5@kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250113-net-mptcp-connect-st-flakes-v1-1-0d986ee7b1b6@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a62bc874bf1e..123f3f297284 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -607,7 +607,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; - WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -1288,7 +1287,7 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } - return; + goto update_wspace; } if (rcv_wnd_new != rcv_wnd_old) { @@ -1313,6 +1312,9 @@ raise_win: th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } + +update_wspace: + WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) -- cgit v1.2.3 From e226d9259dc4f5d2c19e6682ad1356fa97cf38f4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 13 Jan 2025 16:44:57 +0100 Subject: mptcp: fix spurious wake-up on under memory pressure The wake-up condition currently implemented by mptcp_epollin_ready() is wrong, as it could mark the MPTCP socket as readable even when no data are present and the system is under memory pressure. Explicitly check for some data being available in the receive queue. Fixes: 5684ab1a0eff ("mptcp: give rcvlowat some love") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250113-net-mptcp-connect-st-flakes-v1-2-0d986ee7b1b6@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a93e661ef5c4..73526f1d768f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -760,10 +760,15 @@ static inline u64 mptcp_data_avail(const struct mptcp_sock *msk) static inline bool mptcp_epollin_ready(const struct sock *sk) { + u64 data_avail = mptcp_data_avail(mptcp_sk(sk)); + + if (!data_avail) + return false; + /* mptcp doesn't have to deal with small skbs in the receive queue, - * at it can always coalesce them + * as it can always coalesce them */ - return (mptcp_data_avail(mptcp_sk(sk)) >= sk->sk_rcvlowat) || + return (data_avail >= sk->sk_rcvlowat) || (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) || READ_ONCE(tcp_memory_pressure); -- cgit v1.2.3 From f0d0277796db613c124206544b6dbe95b520ab6c Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Mon, 13 Jan 2025 17:13:54 -0800 Subject: net: netpoll: ensure skb_pool list is always initialized When __netpoll_setup() is called directly, instead of through netpoll_setup(), the np->skb_pool list head isn't initialized. If skb_pool_flush() is later called, then we hit a NULL pointer in skb_queue_purge_reason(). This can be seen with this repro, when CONFIG_NETCONSOLE is enabled as a module: ip tuntap add mode tap tap0 ip link add name br0 type bridge ip link set dev tap0 master br0 modprobe netconsole netconsole=4444@10.0.0.1/br0,9353@10.0.0.2/ rmmod netconsole The backtrace is: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page ... ... ... Call Trace: __netpoll_free+0xa5/0xf0 br_netpoll_cleanup+0x43/0x50 [bridge] do_netpoll_cleanup+0x43/0xc0 netconsole_netdev_event+0x1e3/0x300 [netconsole] unregister_netdevice_notifier+0xd9/0x150 cleanup_module+0x45/0x920 [netconsole] __se_sys_delete_module+0x205/0x290 do_syscall_64+0x70/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e Move the skb_pool list setup and initial skb fill into __netpoll_setup(). Fixes: 221a9c1df790 ("net: netpoll: Individualize the skb pool") Signed-off-by: John Sperbeck Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20250114011354.2096812-1-jsperbeck@google.com Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 2e459b9d88eb..96a6ed37d4cc 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -627,6 +627,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; + skb_queue_head_init(&np->skb_pool); + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", ndev->name); @@ -662,6 +664,9 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) strscpy(np->dev_name, ndev->name, IFNAMSIZ); npinfo->netpoll = np; + /* fill up the skb queue */ + refill_skbs(np); + /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -681,8 +686,6 @@ int netpoll_setup(struct netpoll *np) struct in_device *in_dev; int err; - skb_queue_head_init(&np->skb_pool); - rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; @@ -782,9 +785,6 @@ put_noaddr: } } - /* fill up the skb queue */ - refill_skbs(np); - err = __netpoll_setup(np, ndev); if (err) goto flush; -- cgit v1.2.3 From a50da36562cd62b41de9bef08edbb3e8af00f118 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Jan 2025 08:14:36 -0800 Subject: netdev: avoid CFI problems with sock priv helpers Li Li reports that casting away callback type may cause issues for CFI. Let's generate a small wrapper for each callback, to make sure compiler sees the anticipated types. Reported-by: Li Li Link: https://lore.kernel.org/CANBPYPjQVqmzZ4J=rVQX87a9iuwmaetULwbK_5_3YWk2eGzkaA@mail.gmail.com Fixes: 170aafe35cb9 ("netdev: support binding dma-buf to netdevice") Signed-off-by: Jakub Kicinski Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250115161436.648646-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/netdev-genl-gen.c | 14 ++++++++++++-- tools/net/ynl/ynl-gen-c.py | 16 +++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index a89cbd8d87c3..996ac6a449eb 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -197,6 +197,16 @@ static const struct genl_multicast_group netdev_nl_mcgrps[] = { [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", }, }; +static void __netdev_nl_sock_priv_init(void *priv) +{ + netdev_nl_sock_priv_init(priv); +} + +static void __netdev_nl_sock_priv_destroy(void *priv) +{ + netdev_nl_sock_priv_destroy(priv); +} + struct genl_family netdev_nl_family __ro_after_init = { .name = NETDEV_FAMILY_NAME, .version = NETDEV_FAMILY_VERSION, @@ -208,6 +218,6 @@ struct genl_family netdev_nl_family __ro_after_init = { .mcgrps = netdev_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), .sock_priv_size = sizeof(struct list_head), - .sock_priv_init = (void *)netdev_nl_sock_priv_init, - .sock_priv_destroy = (void *)netdev_nl_sock_priv_destroy, + .sock_priv_init = __netdev_nl_sock_priv_init, + .sock_priv_destroy = __netdev_nl_sock_priv_destroy, }; diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index d8201c4b1520..6750fdb42564 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -2384,6 +2384,17 @@ def print_kernel_family_struct_src(family, cw): if not kernel_can_gen_family_struct(family): return + if 'sock-priv' in family.kernel_family: + # Generate "trampolines" to make CFI happy + cw.write_func("static void", f"__{family.c_name}_nl_sock_priv_init", + [f"{family.c_name}_nl_sock_priv_init(priv);"], + ["void *priv"]) + cw.nl() + cw.write_func("static void", f"__{family.c_name}_nl_sock_priv_destroy", + [f"{family.c_name}_nl_sock_priv_destroy(priv);"], + ["void *priv"]) + cw.nl() + cw.block_start(f"struct genl_family {family.ident_name}_nl_family __ro_after_init =") cw.p('.name\t\t= ' + family.fam_key + ',') cw.p('.version\t= ' + family.ver_key + ',') @@ -2401,9 +2412,8 @@ def print_kernel_family_struct_src(family, cw): cw.p(f'.n_mcgrps\t= ARRAY_SIZE({family.c_name}_nl_mcgrps),') if 'sock-priv' in family.kernel_family: cw.p(f'.sock_priv_size\t= sizeof({family.kernel_family["sock-priv"]}),') - # Force cast here, actual helpers take pointer to the real type. - cw.p(f'.sock_priv_init\t= (void *){family.c_name}_nl_sock_priv_init,') - cw.p(f'.sock_priv_destroy = (void *){family.c_name}_nl_sock_priv_destroy,') + cw.p(f'.sock_priv_init\t= __{family.c_name}_nl_sock_priv_init,') + cw.p(f'.sock_priv_destroy = __{family.c_name}_nl_sock_priv_destroy,') cw.block_end(';') -- cgit v1.2.3