From 028363685bd0b7a19b4a820f82dd905b1dc83999 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 9 Apr 2025 15:59:57 +0200 Subject: espintcp: remove encap socket caching to avoid reference leak The current scheme for caching the encap socket can lead to reference leaks when we try to delete the netns. The reference chain is: xfrm_state -> enacp_sk -> netns Since the encap socket is a userspace socket, it holds a reference on the netns. If we delete the espintcp state (through flush or individual delete) before removing the netns, the reference on the socket is dropped and the netns is correctly deleted. Otherwise, the netns may not be reachable anymore (if all processes within the ns have terminated), so we cannot delete the xfrm state to drop its reference on the socket. This patch results in a small (~2% in my tests) performance regression. A GC-type mechanism could be added for the socket cache, to clear references if the state hasn't been used "recently", but it's a lot more complex than just not caching the socket. Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 39365fd2ea17..06ab2a3d2ebd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -236,7 +236,6 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; - struct sock __rcu *encap_sk; /* NAT keepalive */ u32 nat_keepalive_interval; /* seconds */ -- cgit v1.2.3 From c46286fdd6aa1d0e33c245bcffe9ff2428a777bd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 15 May 2025 18:49:26 +0200 Subject: mr: consolidate the ipmr_can_free_table() checks. Guoyu Yin reported a splat in the ipmr netns cleanup path: WARNING: CPU: 2 PID: 14564 at net/ipv4/ipmr.c:440 ipmr_free_table net/ipv4/ipmr.c:440 [inline] WARNING: CPU: 2 PID: 14564 at net/ipv4/ipmr.c:440 ipmr_rules_exit+0x135/0x1c0 net/ipv4/ipmr.c:361 Modules linked in: CPU: 2 UID: 0 PID: 14564 Comm: syz.4.838 Not tainted 6.14.0 #1 Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:ipmr_free_table net/ipv4/ipmr.c:440 [inline] RIP: 0010:ipmr_rules_exit+0x135/0x1c0 net/ipv4/ipmr.c:361 Code: ff df 48 c1 ea 03 80 3c 02 00 75 7d 48 c7 83 60 05 00 00 00 00 00 00 5b 5d 41 5c 41 5d 41 5e e9 71 67 7f 00 e8 4c 2d 8a fd 90 <0f> 0b 90 eb 93 e8 41 2d 8a fd 0f b6 2d 80 54 ea 01 31 ff 89 ee e8 RSP: 0018:ffff888109547c58 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff888108c12dc0 RCX: ffffffff83e09868 RDX: ffff8881022b3300 RSI: ffffffff83e098d4 RDI: 0000000000000005 RBP: ffff888104288000 R08: 0000000000000000 R09: ffffed10211825c9 R10: 0000000000000001 R11: ffff88801816c4a0 R12: 0000000000000001 R13: ffff888108c13320 R14: ffff888108c12dc0 R15: fffffbfff0b74058 FS: 00007f84f39316c0(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f84f3930f98 CR3: 0000000113b56000 CR4: 0000000000350ef0 Call Trace: ipmr_net_exit_batch+0x50/0x90 net/ipv4/ipmr.c:3160 ops_exit_list+0x10c/0x160 net/core/net_namespace.c:177 setup_net+0x47d/0x8e0 net/core/net_namespace.c:394 copy_net_ns+0x25d/0x410 net/core/net_namespace.c:516 create_new_namespaces+0x3f6/0xaf0 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc3/0x180 kernel/nsproxy.c:228 ksys_unshare+0x78d/0x9a0 kernel/fork.c:3342 __do_sys_unshare kernel/fork.c:3413 [inline] __se_sys_unshare kernel/fork.c:3411 [inline] __x64_sys_unshare+0x31/0x40 kernel/fork.c:3411 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xa6/0x1a0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f84f532cc29 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f84f3931038 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f84f5615fa0 RCX: 00007f84f532cc29 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000400 RBP: 00007f84f53fba18 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f84f5615fa0 R15: 00007fff51c5f328 The running kernel has CONFIG_IP_MROUTE_MULTIPLE_TABLES disabled, and the sanity check for such build is still too loose. Address the issue consolidating the relevant sanity check in a single helper regardless of the kernel configuration. Also share it between the ipv4 and ipv6 code. Reported-by: Guoyu Yin Fixes: 50b94204446e ("ipmr: tune the ipmr_can_free_table() checks.") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/372dc261e1bf12742276e1b984fc5a071b7fc5a8.1747321903.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 5 +++++ net/ipv4/ipmr.c | 12 +----------- net/ipv6/ip6mr.c | 12 +----------- 3 files changed, 7 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 58a2401e4b55..0075f6e5c3da 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -262,6 +262,11 @@ struct mr_table { int mroute_reg_vif_num; }; +static inline bool mr_can_free_table(struct net *net) +{ + return !check_net(net) || !net_initialized(net); +} + #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a8b04d4abcaa..85dc208f32e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -120,11 +120,6 @@ static void ipmr_expire_process(struct timer_list *t); lockdep_rtnl_is_held() || \ list_empty(&net->ipv4.mr_tables)) -static bool ipmr_can_free_table(struct net *net) -{ - return !check_net(net) || !net_initialized(net); -} - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -317,11 +312,6 @@ EXPORT_SYMBOL(ipmr_rule_default); #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) -static bool ipmr_can_free_table(struct net *net) -{ - return !check_net(net); -} - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -437,7 +427,7 @@ static void ipmr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); - WARN_ON_ONCE(!ipmr_can_free_table(net)); + WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index b413c9c8a21c..3276cde5ebd7 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -108,11 +108,6 @@ static void ipmr_expire_process(struct timer_list *t); lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) -static bool ip6mr_can_free_table(struct net *net) -{ - return !check_net(net) || !net_initialized(net); -} - static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -306,11 +301,6 @@ EXPORT_SYMBOL(ip6mr_rule_default); #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) -static bool ip6mr_can_free_table(struct net *net) -{ - return !check_net(net); -} - static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -416,7 +406,7 @@ static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); - WARN_ON_ONCE(!ip6mr_can_free_table(net)); + WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | -- cgit v1.2.3 From aed031da7e8cf6e31816a34afde961fbe0305fea Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 19 May 2025 13:41:28 -0700 Subject: bnxt_en: Fix netdev locking in ULP IRQ functions netdev_lock is already held when calling bnxt_ulp_irq_stop() and bnxt_ulp_irq_restart(). When converting rtnl_lock to netdev_lock, the original code was rtnl_dereference() to indicate that rtnl_lock was already held. rcu_dereference_protected() is the correct conversion after replacing rtnl_lock with netdev_lock. Add a new helper netdev_lock_dereference() similar to rtnl_dereference(). Fixes: 004b5008016a ("eth: bnxt: remove most dependencies on RTNL") Reviewed-by: Andy Gospodarek Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250519204130.3097027-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 9 +++------ include/net/netdev_lock.h | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index a8e930d5dbb0..7564705d6478 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "bnxt_hsi.h" #include "bnxt.h" @@ -309,14 +310,12 @@ void bnxt_ulp_irq_stop(struct bnxt *bp) if (!ulp->msix_requested) return; - netdev_lock(bp->dev); - ops = rcu_dereference(ulp->ulp_ops); + ops = netdev_lock_dereference(ulp->ulp_ops, bp->dev); if (!ops || !ops->ulp_irq_stop) return; if (test_bit(BNXT_STATE_FW_RESET_DET, &bp->state)) reset = true; ops->ulp_irq_stop(ulp->handle, reset); - netdev_unlock(bp->dev); } } @@ -335,8 +334,7 @@ void bnxt_ulp_irq_restart(struct bnxt *bp, int err) if (!ulp->msix_requested) return; - netdev_lock(bp->dev); - ops = rcu_dereference(ulp->ulp_ops); + ops = netdev_lock_dereference(ulp->ulp_ops, bp->dev); if (!ops || !ops->ulp_irq_restart) return; @@ -348,7 +346,6 @@ void bnxt_ulp_irq_restart(struct bnxt *bp, int err) bnxt_fill_msix_vecs(bp, ent); } ops->ulp_irq_restart(ulp->handle, ent); - netdev_unlock(bp->dev); kfree(ent); } } diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index c316b551df8d..0ee5bc766810 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -98,6 +98,9 @@ static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, &qdisc_xmit_lock_key); \ } +#define netdev_lock_dereference(p, dev) \ + rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) + int netdev_debug_event(struct notifier_block *nb, unsigned long event, void *ptr); -- cgit v1.2.3