From 424f8416bb39936df6365442d651ee729b283460 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 May 2023 17:06:18 +0000 Subject: net: skb_partial_csum_set() fix against transport header magic value skb->transport_header uses the special 0xFFFF value to mark if the transport header was set or not. We must prevent callers to accidentaly set skb->transport_header to 0xFFFF. Note that only fuzzers can possibly do this today. syzbot reported: WARNING: CPU: 0 PID: 2340 at include/linux/skbuff.h:2847 skb_transport_offset include/linux/skbuff.h:2956 [inline] WARNING: CPU: 0 PID: 2340 at include/linux/skbuff.h:2847 virtio_net_hdr_to_skb+0xbcc/0x10c0 include/linux/virtio_net.h:103 Modules linked in: CPU: 0 PID: 2340 Comm: syz-executor.0 Not tainted 6.3.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023 RIP: 0010:skb_transport_header include/linux/skbuff.h:2847 [inline] RIP: 0010:skb_transport_offset include/linux/skbuff.h:2956 [inline] RIP: 0010:virtio_net_hdr_to_skb+0xbcc/0x10c0 include/linux/virtio_net.h:103 Code: 41 39 df 0f 82 c3 04 00 00 48 8b 7c 24 10 44 89 e6 e8 08 6e 59 ff 48 85 c0 74 54 e8 ce 36 7e fc e9 37 f8 ff ff e8 c4 36 7e fc <0f> 0b e9 93 f8 ff ff 44 89 f7 44 89 e6 e8 32 38 7e fc 45 39 e6 0f RSP: 0018:ffffc90004497880 EFLAGS: 00010293 RAX: ffffffff84fea55c RBX: 000000000000ffff RCX: ffff888120be2100 RDX: 0000000000000000 RSI: 000000000000ffff RDI: 000000000000ffff RBP: ffffc90004497990 R08: ffffffff84fe9de5 R09: 0000000000000034 R10: ffffea00048ebd80 R11: 0000000000000034 R12: ffff88811dc2d9c8 R13: dffffc0000000000 R14: ffff88811dc2d9ae R15: 1ffff11023b85b35 FS: 00007f9211a59700(0000) GS:ffff8881f6c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200002c0 CR3: 00000001215a5000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: packet_snd net/packet/af_packet.c:3076 [inline] packet_sendmsg+0x4590/0x61a0 net/packet/af_packet.c:3115 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] __sys_sendto+0x472/0x630 net/socket.c:2144 __do_sys_sendto net/socket.c:2156 [inline] __se_sys_sendto net/socket.c:2152 [inline] __x64_sys_sendto+0xe5/0x100 net/socket.c:2152 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2f/0x50 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f9210c8c169 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9211a59168 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007f9210dabf80 RCX: 00007f9210c8c169 RDX: 000000000000ffed RSI: 00000000200000c0 RDI: 0000000000000003 RBP: 00007f9210ce7ca1 R08: 0000000020000540 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe135d65cf R14: 00007f9211a59300 R15: 0000000000022000 Fixes: 66e4c8d95008 ("net: warn if transport header was not set") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26a586007d8b..515ec5cdc79c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5298,7 +5298,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); u32 csum_start = skb_headroom(skb) + (u32)start; - if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", start, off, skb_headroom(skb), skb_headlen(skb)); return false; @@ -5306,7 +5306,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = csum_start; skb->csum_offset = off; - skb_set_transport_header(skb, start); + skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); -- cgit v1.2.3 From dc1c9fd4a8bbe1e06add9053010b652449bfe411 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 May 2023 14:20:21 +0200 Subject: netfilter: nf_tables: always release netdev hooks from notifier This reverts "netfilter: nf_tables: skip netdev events generated on netns removal". The problem is that when a veth device is released, the veth release callback will also queue the peer netns device for removal. Its possible that the peer netns is also slated for removal. In this case, the device memory is already released before the pre_exit hook of the peer netns runs: BUG: KASAN: slab-use-after-free in nf_hook_entry_head+0x1b8/0x1d0 Read of size 8 at addr ffff88812c0124f0 by task kworker/u8:1/45 Workqueue: netns cleanup_net Call Trace: nf_hook_entry_head+0x1b8/0x1d0 __nf_unregister_net_hook+0x76/0x510 nft_netdev_unregister_hooks+0xa0/0x220 __nft_release_hook+0x184/0x490 nf_tables_pre_exit_net+0x12f/0x1b0 .. Order is: 1. First netns is released, veth_dellink() queues peer netns device for removal 2. peer netns is queued for removal 3. peer netns device is released, unreg event is triggered 4. unreg event is ignored because netns is going down 5. pre_exit hook calls nft_netdev_unregister_hooks but device memory might be free'd already. Fixes: 68a3765c659f ("netfilter: nf_tables: skip netdev events generated on netns removal") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_chain_filter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c3563f0be269..680fe557686e 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -344,6 +344,12 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ __nft_release_basechain(ctx); } @@ -362,9 +368,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - if (!check_net(ctx.net)) - return NOTIFY_DONE; - nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { -- cgit v1.2.3 From e72eeab542dbf4f544e389e64fa13b82a1b6d003 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 May 2023 14:55:02 +0200 Subject: netfilter: conntrack: fix possible bug_on with enable_hooks=1 I received a bug report (no reproducer so far) where we trip over 712 rcu_read_lock(); 713 ct_hook = rcu_dereference(nf_ct_hook); 714 BUG_ON(ct_hook == NULL); // here In nf_conntrack_destroy(). First turn this BUG_ON into a WARN. I think it was triggered via enable_hooks=1 flag. When this flag is turned on, the conntrack hooks are registered before nf_ct_hook pointer gets assigned. This opens a short window where packets enter the conntrack machinery, can have skb->_nfct set up and a subsequent kfree_skb might occur before nf_ct_hook is set. Call nf_conntrack_init_end() to set nf_ct_hook before we register the pernet ops. Fixes: ba3fbe663635 ("netfilter: nf_conntrack: provide modparam to always register conntrack hooks") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 6 ++++-- net/netfilter/nf_conntrack_standalone.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f0783e42108b..5f76ae86a656 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -711,9 +711,11 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - BUG_ON(ct_hook == NULL); - ct_hook->destroy(nfct); + if (ct_hook) + ct_hook->destroy(nfct); rcu_read_unlock(); + + WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 57f6724c99a7..169e16fc2bce 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1218,11 +1218,12 @@ static int __init nf_conntrack_standalone_init(void) nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif + nf_conntrack_init_end(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; - nf_conntrack_init_end(); return 0; out_pernet: -- cgit v1.2.3 From a939d14919b799e6fff8a9c80296ca229ba2f8a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 16:56:34 +0000 Subject: netlink: annotate accesses to nlk->cb_running Both netlink_recvmsg() and netlink_native_seq_show() read nlk->cb_running locklessly. Use READ_ONCE() there. Add corresponding WRITE_ONCE() to netlink_dump() and __netlink_dump_start() syzbot reported: BUG: KCSAN: data-race in __netlink_dump_start / netlink_recvmsg write to 0xffff88813ea4db59 of 1 bytes by task 28219 on cpu 0: __netlink_dump_start+0x3af/0x4d0 net/netlink/af_netlink.c:2399 netlink_dump_start include/linux/netlink.h:308 [inline] rtnetlink_rcv_msg+0x70f/0x8c0 net/core/rtnetlink.c:6130 netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2577 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6192 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x56f/0x640 net/netlink/af_netlink.c:1365 netlink_sendmsg+0x665/0x770 net/netlink/af_netlink.c:1942 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] sock_write_iter+0x1aa/0x230 net/socket.c:1138 call_write_iter include/linux/fs.h:1851 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x463/0x760 fs/read_write.c:584 ksys_write+0xeb/0x1a0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x42/0x50 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88813ea4db59 of 1 bytes by task 28222 on cpu 1: netlink_recvmsg+0x3b4/0x730 net/netlink/af_netlink.c:2022 sock_recvmsg_nosec+0x4c/0x80 net/socket.c:1017 ____sys_recvmsg+0x2db/0x310 net/socket.c:2718 ___sys_recvmsg net/socket.c:2762 [inline] do_recvmmsg+0x2e5/0x710 net/socket.c:2856 __sys_recvmmsg net/socket.c:2935 [inline] __do_sys_recvmmsg net/socket.c:2958 [inline] __se_sys_recvmmsg net/socket.c:2951 [inline] __x64_sys_recvmmsg+0xe2/0x160 net/socket.c:2951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00 -> 0x01 Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7ef8b9a1e30c..c87804112d0c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1990,7 +1990,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, skb_free_datagram(sk, skb); - if (nlk->cb_running && + if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { @@ -2302,7 +2302,7 @@ static int netlink_dump(struct sock *sk) if (cb->done) cb->done(cb); - nlk->cb_running = false; + WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(nlk->cb_mutex); @@ -2365,7 +2365,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, goto error_put; } - nlk->cb_running = true; + WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); @@ -2703,7 +2703,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - nlk->cb_running, + READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) -- cgit v1.2.3 From e05a5f510f26607616fecdd4ac136310c8bea56b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 16:35:53 +0000 Subject: net: annotate sk->sk_err write from do_recvmmsg() do_recvmmsg() can write to sk->sk_err from multiple threads. As said before, many other points reading or writing sk_err need annotations. Fixes: 34b88a68f26a ("net: Fix use after free in the recvmmsg exit path") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index a7b4b37d86df..b7e01d0fe082 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2911,7 +2911,7 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ - sock->sk->sk_err = -err; + WRITE_ONCE(sock->sk->sk_err, -err); } out_put: fput_light(sock->file, fput_needed); -- cgit v1.2.3 From d0ac89f6f9879fae316c155de77b5173b3e2c9c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 18:29:48 +0000 Subject: net: deal with most data-races in sk_wait_event() __condition is evaluated twice in sk_wait_event() macro. First invocation is lockless, and reads can race with writes, as spotted by syzbot. BUG: KCSAN: data-race in sk_stream_wait_connect / tcp_disconnect write to 0xffff88812d83d6a0 of 4 bytes by task 9065 on cpu 1: tcp_disconnect+0x2cd/0xdb0 inet_shutdown+0x19e/0x1f0 net/ipv4/af_inet.c:911 __sys_shutdown_sock net/socket.c:2343 [inline] __sys_shutdown net/socket.c:2355 [inline] __do_sys_shutdown net/socket.c:2363 [inline] __se_sys_shutdown+0xf8/0x140 net/socket.c:2361 __x64_sys_shutdown+0x31/0x40 net/socket.c:2361 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88812d83d6a0 of 4 bytes by task 9040 on cpu 0: sk_stream_wait_connect+0x1de/0x3a0 net/core/stream.c:75 tcp_sendmsg_locked+0x2e4/0x2120 net/ipv4/tcp.c:1266 tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1484 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:651 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] __sys_sendto+0x246/0x300 net/socket.c:2142 __do_sys_sendto net/socket.c:2154 [inline] __se_sys_sendto net/socket.c:2150 [inline] __x64_sys_sendto+0x78/0x90 net/socket.c:2150 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00000000 -> 0x00000068 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/stream.c | 12 ++++++------ net/ipv4/tcp_bpf.c | 2 +- net/llc/af_llc.c | 8 +++++--- net/smc/smc_close.c | 4 ++-- net/smc/smc_rx.c | 4 ++-- net/smc/smc_tx.c | 4 ++-- net/tipc/socket.c | 4 ++-- net/tls/tls_main.c | 3 ++- 8 files changed, 22 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/stream.c b/net/core/stream.c index 434446ab14c5..f5c4e47df165 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -73,8 +73,8 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, - !sk->sk_err && - !((1 << sk->sk_state) & + !READ_ONCE(sk->sk_err) && + !((1 << READ_ONCE(sk->sk_state)) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; @@ -87,9 +87,9 @@ EXPORT_SYMBOL(sk_stream_wait_connect); * sk_stream_closing - Return 1 if we still have things to send in our buffers. * @sk: socket to verify */ -static inline int sk_stream_closing(struct sock *sk) +static int sk_stream_closing(const struct sock *sk) { - return (1 << sk->sk_state) & + return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } @@ -142,8 +142,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && !vm_wait), &wait); sk->sk_write_pending--; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ebf917511937..2e9547467edb 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -168,7 +168,7 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); + !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index da7fe94bea2e..9ffbc667be6c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -583,7 +583,8 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) add_wait_queue(sk_sleep(sk), &wait); while (1) { - if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait)) + if (sk_wait_event(sk, &timeout, + READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) @@ -603,7 +604,8 @@ static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) add_wait_queue(sk_sleep(sk), &wait); while (1) { - if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait)) + if (sk_wait_event(sk, &timeout, + READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; @@ -622,7 +624,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) while (1) { rc = 0; if (sk_wait_event(sk, &timeout, - (sk->sk_shutdown & RCV_SHUTDOWN) || + (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 31db7438857c..dbdf03e8aa5b 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -67,8 +67,8 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - sk->sk_err == ECONNABORTED || - sk->sk_err == ECONNRESET || + READ_ONCE(sk->sk_err) == ECONNABORTED || + READ_ONCE(sk->sk_err) == ECONNRESET || smc->conn.killed, &wait); if (rc) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 4380d32f5a5f..9a2f3638d161 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -267,9 +267,9 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, - sk->sk_err || + READ_ONCE(sk->sk_err) || cflags->peer_conn_abort || - sk->sk_shutdown & RCV_SHUTDOWN || + READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || fcrit(conn), &wait); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f4b6a71ac488..45128443f1f1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -113,8 +113,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, - sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 37edfe10f8c6..dd73d71c02a9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -314,9 +314,9 @@ static void tsk_rej_rx_queue(struct sock *sk, int error) tipc_sk_respond(sk, skb, error); } -static bool tipc_sk_connected(struct sock *sk) +static bool tipc_sk_connected(const struct sock *sk) { - return sk->sk_state == TIPC_ESTABLISHED; + return READ_ONCE(sk->sk_state) == TIPC_ESTABLISHED; } /* tipc_sk_type_connectionless - check if the socket is datagram socket diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b32c112984dd..f2e7302a4d96 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -111,7 +111,8 @@ int wait_on_pending_writer(struct sock *sk, long *timeo) break; } - if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait)) + if (sk_wait_event(sk, timeo, + !READ_ONCE(sk->sk_write_pending), &wait)) break; } remove_wait_queue(sk_sleep(sk), &wait); -- cgit v1.2.3 From 4063384ef762cc5946fc7a3f89879e76c6ec51e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 13:18:57 +0000 Subject: net: add vlan_get_protocol_and_depth() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before blamed commit, pskb_may_pull() was used instead of skb_header_pointer() in __vlan_get_protocol() and friends. Few callers depended on skb->head being populated with MAC header, syzbot caught one of them (skb_mac_gso_segment()) Add vlan_get_protocol_and_depth() to make the intent clearer and use it where sensible. This is a more generic fix than commit e9d3f80935b6 ("net/af_packet: make sure to pull mac header") which was dealing with a similar issue. kernel BUG at include/linux/skbuff.h:2655 ! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1441 Comm: syz-executor199 Not tainted 6.1.24-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023 RIP: 0010:__skb_pull include/linux/skbuff.h:2655 [inline] RIP: 0010:skb_mac_gso_segment+0x68f/0x6a0 net/core/gro.c:136 Code: fd 48 8b 5c 24 10 44 89 6b 70 48 c7 c7 c0 ae 0d 86 44 89 e6 e8 a1 91 d0 00 48 c7 c7 00 af 0d 86 48 89 de 31 d2 e8 d1 4a e9 ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 RSP: 0018:ffffc90001bd7520 EFLAGS: 00010286 RAX: ffffffff8469736a RBX: ffff88810f31dac0 RCX: ffff888115a18b00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc90001bd75e8 R08: ffffffff84697183 R09: fffff5200037adf9 R10: 0000000000000000 R11: dffffc0000000001 R12: 0000000000000012 R13: 000000000000fee5 R14: 0000000000005865 R15: 000000000000fed7 FS: 000055555633f300(0000) GS:ffff8881f6a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 0000000116fea000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] __skb_gso_segment+0x32d/0x4c0 net/core/dev.c:3419 [] skb_gso_segment include/linux/netdevice.h:4819 [inline] [] validate_xmit_skb+0x3aa/0xee0 net/core/dev.c:3725 [] __dev_queue_xmit+0x1332/0x3300 net/core/dev.c:4313 [] dev_queue_xmit+0x17/0x20 include/linux/netdevice.h:3029 [] packet_snd net/packet/af_packet.c:3111 [inline] [] packet_sendmsg+0x49d2/0x6470 net/packet/af_packet.c:3142 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] __sys_sendto+0x472/0x5f0 net/socket.c:2139 [] __do_sys_sendto net/socket.c:2151 [inline] [] __se_sys_sendto net/socket.c:2147 [inline] [] __x64_sys_sendto+0xe5/0x100 net/socket.c:2147 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x2f/0x50 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 469aceddfa3e ("vlan: consolidate VLAN parsing code and limit max parsing depth") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Toke Høiland-Jørgensen Cc: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/tap.c | 4 ++-- include/linux/if_vlan.h | 17 +++++++++++++++++ net/bridge/br_forward.c | 2 +- net/core/dev.c | 2 +- net/packet/af_packet.c | 6 ++---- 5 files changed, 23 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index ce993cc75bf3..d30d730ed5a7 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -742,7 +742,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); /* copy skb_ubuf_info for callback when skb has no error */ @@ -1197,7 +1197,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 0f40f379d75c..6ba71957851e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -637,6 +637,23 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* This version of __vlan_get_protocol() also pulls mac header in skb->head */ +static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, + __be16 type, int *depth) +{ + int maclen; + + type = __vlan_get_protocol(skb, type, &maclen); + + if (type) { + if (!pskb_may_pull(skb, maclen)) + type = 0; + else if (depth) + *depth = maclen; + } + return type; +} + /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 57744704ff69..84d6dd5e5b1a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -42,7 +42,7 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb eth_type_vlan(skb->protocol)) { int depth; - if (!__vlan_get_protocol(skb, skb->protocol, &depth)) + if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth)) goto drop; skb_set_network_header(skb, depth); diff --git a/net/core/dev.c b/net/core/dev.c index 735096d42c1d..b3c13e041935 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3335,7 +3335,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - return __vlan_get_protocol(skb, type, depth); + return vlan_get_protocol_and_depth(skb, type, depth); } /* openvswitch calls this on rx path, so we need a different check. diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 640d94e34635..94c6a1ffa459 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1934,10 +1934,8 @@ static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) { - if (pskb_may_pull(skb, depth)) - skb_set_network_header(skb, depth); - } + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } -- cgit v1.2.3 From e14cadfd80d76f01bfaa1a8d745b1db19b57d6be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 20:36:56 +0000 Subject: tcp: add annotations around sk->sk_shutdown accesses Now sk->sk_shutdown is no longer a bitfield, we can add standard READ_ONCE()/WRITE_ONCE() annotations to silence KCSAN reports like the following: BUG: KCSAN: data-race in tcp_disconnect / tcp_poll write to 0xffff88814588582c of 1 bytes by task 3404 on cpu 1: tcp_disconnect+0x4d6/0xdb0 net/ipv4/tcp.c:3121 __inet_stream_connect+0x5dd/0x6e0 net/ipv4/af_inet.c:715 inet_stream_connect+0x48/0x70 net/ipv4/af_inet.c:727 __sys_connect_file net/socket.c:2001 [inline] __sys_connect+0x19b/0x1b0 net/socket.c:2018 __do_sys_connect net/socket.c:2028 [inline] __se_sys_connect net/socket.c:2025 [inline] __x64_sys_connect+0x41/0x50 net/socket.c:2025 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88814588582c of 1 bytes by task 3374 on cpu 0: tcp_poll+0x2e6/0x7d0 net/ipv4/tcp.c:562 sock_poll+0x253/0x270 net/socket.c:1383 vfs_poll include/linux/poll.h:88 [inline] io_poll_check_events io_uring/poll.c:281 [inline] io_poll_task_func+0x15a/0x820 io_uring/poll.c:333 handle_tw_list io_uring/io_uring.c:1184 [inline] tctx_task_work+0x1fe/0x4d0 io_uring/io_uring.c:1246 task_work_run+0x123/0x160 kernel/task_work.c:179 get_signal+0xe64/0xff0 kernel/signal.c:2635 arch_do_signal_or_restart+0x89/0x2a0 arch/x86/kernel/signal.c:306 exit_to_user_mode_loop+0x6f/0xe0 kernel/entry/common.c:168 exit_to_user_mode_prepare+0x6c/0xb0 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:286 [inline] syscall_exit_to_user_mode+0x26/0x140 kernel/entry/common.c:297 do_syscall_64+0x4d/0xc0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x03 -> 0x00 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 14 ++++++++------ net/ipv4/tcp_input.c | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 940062e08f57..c4aab3aacbd8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -894,7 +894,7 @@ int inet_shutdown(struct socket *sock, int how) EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ fallthrough; default: - sk->sk_shutdown |= how; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 20db115c38c4..4d6392c16b7a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -498,6 +498,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + u8 shutdown; int state; sock_poll_wait(file, sock, wait); @@ -540,9 +541,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ @@ -559,7 +561,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (!(shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ @@ -2867,7 +2869,7 @@ void __tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; int state; - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -3119,7 +3121,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet_bhash2_reset_saddr(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -4649,7 +4651,7 @@ void tcp_done(struct sock *sk) if (req) reqsk_fastopen_remove(sk, req, false); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a057330d6f59..61b6710f337a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4362,7 +4362,7 @@ void tcp_fin(struct sock *sk) inet_csk_schedule_ack(sk); - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { @@ -6599,7 +6599,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); sk_dst_confirm(sk); -- cgit v1.2.3 From 5bca1d081f44c9443e61841842ce4e9179d327b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 17:31:31 +0000 Subject: net: datagram: fix data-races in datagram_poll() datagram_poll() runs locklessly, we should add READ_ONCE() annotations while reading sk->sk_err, sk->sk_shutdown and sk->sk_state. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230509173131.3263780-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/datagram.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index 5662dff3d381..176eb5834746 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -807,18 +807,21 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ @@ -827,10 +830,12 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { - if (sk->sk_state == TCP_CLOSE) + int state = READ_ONCE(sk->sk_state); + + if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) + if (state == TCP_SYN_SENT) return mask; } -- cgit v1.2.3 From 679ed006d416ea0cecfe24a99d365d1dea69c683 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 May 2023 17:34:55 -0700 Subject: af_unix: Fix a data race of sk->sk_receive_queue->qlen. KCSAN found a data race of sk->sk_receive_queue->qlen where recvmsg() updates qlen under the queue lock and sendmsg() checks qlen under unix_state_sock(), not the queue lock, so the reader side needs READ_ONCE(). BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_wait_for_peer write (marked) to 0xffff888019fe7c68 of 4 bytes by task 49792 on cpu 0: __skb_unlink include/linux/skbuff.h:2347 [inline] __skb_try_recv_from_queue+0x3de/0x470 net/core/datagram.c:197 __skb_try_recv_datagram+0xf7/0x390 net/core/datagram.c:263 __unix_dgram_recvmsg+0x109/0x8a0 net/unix/af_unix.c:2452 unix_dgram_recvmsg+0x94/0xa0 net/unix/af_unix.c:2549 sock_recvmsg_nosec net/socket.c:1019 [inline] ____sys_recvmsg+0x3a3/0x3b0 net/socket.c:2720 ___sys_recvmsg+0xc8/0x150 net/socket.c:2764 do_recvmmsg+0x182/0x560 net/socket.c:2858 __sys_recvmmsg net/socket.c:2937 [inline] __do_sys_recvmmsg net/socket.c:2960 [inline] __se_sys_recvmmsg net/socket.c:2953 [inline] __x64_sys_recvmmsg+0x153/0x170 net/socket.c:2953 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc read to 0xffff888019fe7c68 of 4 bytes by task 49793 on cpu 1: skb_queue_len include/linux/skbuff.h:2127 [inline] unix_recvq_full net/unix/af_unix.c:229 [inline] unix_wait_for_peer+0x154/0x1a0 net/unix/af_unix.c:1445 unix_dgram_sendmsg+0x13bc/0x14b0 net/unix/af_unix.c:2048 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg+0x148/0x160 net/socket.c:747 ____sys_sendmsg+0x20e/0x620 net/socket.c:2503 ___sys_sendmsg+0xc6/0x140 net/socket.c:2557 __sys_sendmmsg+0x11d/0x370 net/socket.c:2643 __do_sys_sendmmsg net/socket.c:2672 [inline] __se_sys_sendmmsg net/socket.c:2669 [inline] __x64_sys_sendmmsg+0x58/0x70 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc value changed: 0x0000000b -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 49793 Comm: syz-executor.0 Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Michal Kubiak Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fb31e8a4409e..08102e728b15 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1442,7 +1442,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && - unix_recvq_full(other); + unix_recvq_full_lockless(other); unix_state_unlock(other); -- cgit v1.2.3 From e1d09c2c2f5793474556b60f83900e088d0d366d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 May 2023 17:34:56 -0700 Subject: af_unix: Fix data races around sk->sk_shutdown. KCSAN found a data race around sk->sk_shutdown where unix_release_sock() and unix_shutdown() update it under unix_state_lock(), OTOH unix_poll() and unix_dgram_poll() read it locklessly. We need to annotate the writes and reads with WRITE_ONCE() and READ_ONCE(). BUG: KCSAN: data-race in unix_poll / unix_release_sock write to 0xffff88800d0f8aec of 1 bytes by task 264 on cpu 0: unix_release_sock+0x75c/0x910 net/unix/af_unix.c:631 unix_release+0x59/0x80 net/unix/af_unix.c:1042 __sock_release+0x7d/0x170 net/socket.c:653 sock_close+0x19/0x30 net/socket.c:1397 __fput+0x179/0x5e0 fs/file_table.c:321 ____fput+0x15/0x20 fs/file_table.c:349 task_work_run+0x116/0x1a0 kernel/task_work.c:179 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x174/0x180 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:286 [inline] syscall_exit_to_user_mode+0x1a/0x30 kernel/entry/common.c:297 do_syscall_64+0x4b/0x90 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x72/0xdc read to 0xffff88800d0f8aec of 1 bytes by task 222 on cpu 1: unix_poll+0xa3/0x2a0 net/unix/af_unix.c:3170 sock_poll+0xcf/0x2b0 net/socket.c:1385 vfs_poll include/linux/poll.h:88 [inline] ep_item_poll.isra.0+0x78/0xc0 fs/eventpoll.c:855 ep_send_events fs/eventpoll.c:1694 [inline] ep_poll fs/eventpoll.c:1823 [inline] do_epoll_wait+0x6c4/0xea0 fs/eventpoll.c:2258 __do_sys_epoll_wait fs/eventpoll.c:2270 [inline] __se_sys_epoll_wait fs/eventpoll.c:2265 [inline] __x64_sys_epoll_wait+0xcc/0x190 fs/eventpoll.c:2265 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc value changed: 0x00 -> 0x03 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 222 Comm: dbus-broker Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM sockets") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Michal Kubiak Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 08102e728b15..cc695c9f09ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -603,7 +603,7 @@ static void unix_release_sock(struct sock *sk, int embrion) /* Clear state */ unix_state_lock(sk); sock_orphan(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; @@ -628,7 +628,7 @@ static void unix_release_sock(struct sock *sk, int embrion) if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ - skpair->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); @@ -3008,7 +3008,7 @@ static int unix_shutdown(struct socket *sock, int mode) ++mode; unix_state_lock(sk); - sk->sk_shutdown |= mode; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); @@ -3028,7 +3028,7 @@ static int unix_shutdown(struct socket *sock, int mode) if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); - other->sk_shutdown |= peer_mode; + WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) @@ -3160,16 +3160,18 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; + shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ @@ -3203,9 +3205,11 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int writable; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; + shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || @@ -3213,9 +3217,9 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ -- cgit v1.2.3