From 79efebae4afc2221fa814c3cae001bede66ab259 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 7 Aug 2024 10:47:25 +0100 Subject: 9p: Avoid creating multiple slab caches with the same name In the spirit of [1], avoid creating multiple slab caches with the same name. Instead, add the dev_name into the mix. [1]: https://lore.kernel.org/all/20240807090746.2146479-1-pedro.falcato@gmail.com/ Signed-off-by: Pedro Falcato Reported-by: syzbot+3c5d43e97993e1fa612b@syzkaller.appspotmail.com Message-ID: <20240807094725.2193423-1-pedro.falcato@gmail.com> Signed-off-by: Dominique Martinet --- net/9p/client.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 5cd94721d974..9e7b9151816d 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -979,6 +979,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) int err; struct p9_client *clnt; char *client_id; + char *cache_name; clnt = kmalloc(sizeof(*clnt), GFP_KERNEL); if (!clnt) @@ -1035,15 +1036,22 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (err) goto close_trans; + cache_name = kasprintf(GFP_KERNEL, "9p-fcall-cache-%s", dev_name); + if (!cache_name) { + err = -ENOMEM; + goto close_trans; + } + /* P9_HDRSZ + 4 is the smallest packet header we can have that is * followed by data accessed from userspace by read */ clnt->fcall_cache = - kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize, + kmem_cache_create_usercopy(cache_name, clnt->msize, 0, 0, P9_HDRSZ + 4, clnt->msize - (P9_HDRSZ + 4), NULL); + kfree(cache_name); return clnt; close_trans: -- cgit v1.2.3 From 1d498df44e709d9708c0bf666012933bbc7ef1d6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 27 Sep 2024 16:49:01 +1000 Subject: sunrpc: fix prog selection loop in svc_process_common If the rq_prog is not in the list of programs, then we use the last program in the list and we don't get the expected rpc_prog_unavail error as the subsequent tests on 'progp' being NULL are ineffective. We should only assign progp when we find the right program, and we should initialize it to NULL Reported-by: Dan Carpenter Fixes: 86ab08beb3f0 ("SUNRPC: replace program list with program array") Signed-off-by: NeilBrown Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/svc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 7e7f4e0390c7..79879b7d39cb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1321,7 +1321,7 @@ static int svc_process_common(struct svc_rqst *rqstp) { struct xdr_stream *xdr = &rqstp->rq_res_stream; - struct svc_program *progp; + struct svc_program *progp = NULL; const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_process_info process; @@ -1351,12 +1351,9 @@ svc_process_common(struct svc_rqst *rqstp) rqstp->rq_vers = be32_to_cpup(p++); rqstp->rq_proc = be32_to_cpup(p); - for (pr = 0; pr < serv->sv_nprogs; pr++) { - progp = &serv->sv_programs[pr]; - - if (rqstp->rq_prog == progp->pg_prog) - break; - } + for (pr = 0; pr < serv->sv_nprogs; pr++) + if (rqstp->rq_prog == serv->sv_programs[pr].pg_prog) + progp = &serv->sv_programs[pr]; /* * Decode auth data, and add verifier to reply buffer. -- cgit v1.2.3 From 09d88791c7cd888d5195c84733caf9183dcfbd16 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 20 Sep 2024 14:56:24 +0200 Subject: bpf: Make sure internal and UAPI bpf_redirect flags don't overlap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_info is shared between the SKB and XDP redirect paths, and the two paths use the same numeric flag values in the ri->flags field (specifically, BPF_F_BROADCAST == BPF_F_NEXTHOP). This means that if skb bpf_redirect_neigh() is used with a non-NULL params argument and, subsequently, an XDP redirect is performed using the same bpf_redirect_info struct, the XDP path will get confused and end up crashing, which syzbot managed to trigger. With the stack-allocated bpf_redirect_info, the structure is no longer shared between the SKB and XDP paths, so the crash doesn't happen anymore. However, different code paths using identically-numbered flag values in the same struct field still seems like a bit of a mess, so this patch cleans that up by moving the flag definitions together and redefining the three flags in BPF_F_REDIRECT_INTERNAL to not overlap with the flags used for XDP. It also adds a BUILD_BUG_ON() check to make sure the overlap is not re-introduced by mistake. Fixes: e624d4ed4aa8 ("xdp: Extend xdp_redirect_map with broadcast support") Reported-by: syzbot+cca39e6e84a367a7e6f6@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Daniel Borkmann Closes: https://syzkaller.appspot.com/bug?extid=cca39e6e84a367a7e6f6 Link: https://lore.kernel.org/bpf/20240920125625.59465-1-toke@redhat.com --- include/uapi/linux/bpf.h | 13 +++++-------- net/core/filter.c | 8 +++++--- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c6cd7c7aeeee..e8241b320c6d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6047,11 +6047,6 @@ enum { BPF_F_MARK_ENFORCE = (1ULL << 6), }; -/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -enum { - BPF_F_INGRESS = (1ULL << 0), -}; - /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ enum { BPF_F_TUNINFO_IPV6 = (1ULL << 0), @@ -6198,10 +6193,12 @@ enum { BPF_F_BPRM_SECUREEXEC = (1ULL << 0), }; -/* Flags for bpf_redirect_map helper */ +/* Flags for bpf_redirect and bpf_redirect_map helpers */ enum { - BPF_F_BROADCAST = (1ULL << 3), - BPF_F_EXCLUDE_INGRESS = (1ULL << 4), + BPF_F_INGRESS = (1ULL << 0), /* used for skb path */ + BPF_F_BROADCAST = (1ULL << 3), /* used for XDP path */ + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), /* used for XDP path */ +#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS) }; #define __bpf_md_ptr(type, name) \ diff --git a/net/core/filter.c b/net/core/filter.c index cd3524cb326b..4e3f42cc6611 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2438,9 +2438,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), - BPF_F_PEER = (1ULL << 2), - BPF_F_NEXTHOP = (1ULL << 3), + BPF_F_NEIGH = (1ULL << 16), + BPF_F_PEER = (1ULL << 17), + BPF_F_NEXTHOP = (1ULL << 18), #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; @@ -2450,6 +2450,8 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) struct sk_buff *clone; int ret; + BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS); + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; -- cgit v1.2.3 From e37ab7373696e650d3b6262a5b882aadad69bb9e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 1 Oct 2024 20:05:15 +0000 Subject: tcp: fix to allow timestamp undo if no retransmits were sent Fix the TCP loss recovery undo logic in tcp_packet_delayed() so that it can trigger undo even if TSQ prevents a fast recovery episode from reaching tcp_retransmit_skb(). Geumhwan Yu recently reported that after this commit from 2019: commit bc9f38c8328e ("tcp: avoid unconditional congestion window undo on SYN retransmit") ...and before this fix we could have buggy scenarios like the following: + Due to reordering, a TCP connection receives some SACKs and enters a spurious fast recovery. + TSQ prevents all invocations of tcp_retransmit_skb(), because many skbs are queued in lower layers of the sending machine's network stack; thus tp->retrans_stamp remains 0. + The connection receives a TCP timestamp ECR value echoing a timestamp before the fast recovery, indicating that the fast recovery was spurious. + The connection fails to undo the spurious fast recovery because tp->retrans_stamp is 0, and thus tcp_packet_delayed() returns false, due to the new logic in the 2019 commit: commit bc9f38c8328e ("tcp: avoid unconditional congestion window undo on SYN retransmit") This fix tweaks the logic to be more similar to the tcp_packet_delayed() logic before bc9f38c8328e, except that we take care not to be fooled by the FLAG_SYN_ACKED code path zeroing out tp->retrans_stamp (the bug noted and fixed by Yuchung in bc9f38c8328e). Note that this returns the high-level behavior of tcp_packet_delayed() to again match the comment for the function, which says: "Nothing was retransmitted or returned timestamp is less than timestamp of the first retransmission." Note that this comment is in the original 2005-04-16 Linux git commit, so this is evidently long-standing behavior. Fixes: bc9f38c8328e ("tcp: avoid unconditional congestion window undo on SYN retransmit") Reported-by: Geumhwan Yu Diagnosed-by: Geumhwan Yu Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241001200517.2756803-2-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cc05ec1faac8..233b77890795 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2473,8 +2473,22 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { - return tp->retrans_stamp && - tcp_tsopt_ecr_before(tp, tp->retrans_stamp); + const struct sock *sk = (const struct sock *)tp; + + if (tp->retrans_stamp && + tcp_tsopt_ecr_before(tp, tp->retrans_stamp)) + return true; /* got echoed TS before first retransmission */ + + /* Check if nothing was retransmitted (retrans_stamp==0), which may + * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp + * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear + * retrans_stamp even if we had retransmitted the SYN. + */ + if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */ + sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */ + return true; /* nothing was retransmitted */ + + return false; } /* Undo procedures. */ -- cgit v1.2.3 From b41b4cbd9655bcebcce941bef3601db8110335be Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 1 Oct 2024 20:05:16 +0000 Subject: tcp: fix tcp_enter_recovery() to zero retrans_stamp when it's safe Fix tcp_enter_recovery() so that if there are no retransmits out then we zero retrans_stamp when entering fast recovery. This is necessary to fix two buggy behaviors. Currently a non-zero retrans_stamp value can persist across multiple back-to-back loss recovery episodes. This is because we generally only clears retrans_stamp if we are completely done with loss recoveries, and get to tcp_try_to_open() and find !tcp_any_retrans_done(sk). This behavior causes two bugs: (1) When a loss recovery episode (CA_Loss or CA_Recovery) is followed immediately by a new CA_Recovery, the retrans_stamp value can persist and can be a time before this new CA_Recovery episode starts. That means that timestamp-based undo will be using the wrong retrans_stamp (a value that is too old) when comparing incoming TS ecr values to retrans_stamp to see if the current fast recovery episode can be undone. (2) If there is a roughly minutes-long sequence of back-to-back fast recovery episodes, one after another (e.g. in a shallow-buffered or policed bottleneck), where each fast recovery successfully makes forward progress and recovers one window of sequence space (but leaves at least one retransmit in flight at the end of the recovery), followed by several RTOs, then the ETIMEDOUT check may be using the wrong retrans_stamp (a value set at the start of the first fast recovery in the sequence). This can cause a very premature ETIMEDOUT, killing the connection prematurely. This commit changes the code to zero retrans_stamp when entering fast recovery, when this is known to be safe (no retransmits are out in the network). That ensures that when starting a fast recovery episode, and it is safe to do so, retrans_stamp is set when we send the fast retransmit packet. That addresses both bug (1) and bug (2) by ensuring that (if no retransmits are out when we start a fast recovery) we use the initial fast retransmit of this fast recovery as the time value for undo and ETIMEDOUT calculations. This makes intuitive sense, since the start of a new fast recovery episode (in a scenario where no lost packets are out in the network) means that the connection has made forward progress since the last RTO or fast recovery, and we should thus "restart the clock" used for both undo and ETIMEDOUT logic. Note that if when we start fast recovery there *are* retransmits out in the network, there can still be undesirable (1)/(2) issues. For example, after this patch we can still have the (1) and (2) problems in cases like this: + round 1: sender sends flight 1 + round 2: sender receives SACKs and enters fast recovery 1, retransmits some packets in flight 1 and then sends some new data as flight 2 + round 3: sender receives some SACKs for flight 2, notes losses, and retransmits some packets to fill the holes in flight 2 + fast recovery has some lost retransmits in flight 1 and continues for one or more rounds sending retransmits for flight 1 and flight 2 + fast recovery 1 completes when snd_una reaches high_seq at end of flight 1 + there are still holes in the SACK scoreboard in flight 2, so we enter fast recovery 2, but some retransmits in the flight 2 sequence range are still in flight (retrans_out > 0), so we can't execute the new retrans_stamp=0 added here to clear retrans_stamp It's not yet clear how to fix these remaining (1)/(2) issues in an efficient way without breaking undo behavior, given that retrans_stamp is currently used for undo and ETIMEDOUT. Perhaps the optimal (but expensive) strategy would be to set retrans_stamp to the timestamp of the earliest outstanding retransmit when entering fast recovery. But at least this commit makes things better. Note that this does not change the semantics of retrans_stamp; it simply makes retrans_stamp accurate in some cases where it was not before: (1) Some loss recovery, followed by an immediate entry into a fast recovery, where there are no retransmits out when entering the fast recovery. (2) When a TFO server has a SYNACK retransmit that sets retrans_stamp, and then the ACK that completes the 3-way handshake has SACK blocks that trigger a fast recovery. In this case when entering fast recovery we want to zero out the retrans_stamp from the TFO SYNACK retransmit, and set the retrans_stamp based on the timestamp of the fast recovery. We introduce a tcp_retrans_stamp_cleanup() helper, because this two-line sequence already appears in 3 places and is about to appear in 2 more as a result of this bug fix patch series. Once this bug fix patches series in the net branch makes it into the net-next branch we'll update the 3 other call sites to use the new helper. This is a long-standing issue. The Fixes tag below is chosen to be the oldest commit at which the patch will apply cleanly, which is from Linux v3.5 in 2012. Fixes: 1fbc340514fc ("tcp: early retransmit: tcp_enter_recovery()") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241001200517.2756803-3-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 233b77890795..8e47907c76d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2522,6 +2522,16 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } +/* If loss recovery is finished and there are no retransmits out in the + * network, then we clear retrans_stamp so that upon the next loss recovery + * retransmits_timed_out() and timestamp-undo are using the correct value. + */ +static void tcp_retrans_stamp_cleanup(struct sock *sk) +{ + if (!tcp_any_retrans_done(sk)) + tcp_sk(sk)->retrans_stamp = 0; +} + static void DBGUNDO(struct sock *sk, const char *msg) { #if FASTRETRANS_DEBUG > 1 @@ -2889,6 +2899,9 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) struct tcp_sock *tp = tcp_sk(sk); int mib_idx; + /* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */ + tcp_retrans_stamp_cleanup(sk); + if (tcp_is_reno(tp)) mib_idx = LINUX_MIB_TCPRENORECOVERY; else -- cgit v1.2.3 From 27c80efcc20486c82698f05f00e288b44513c86b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 1 Oct 2024 20:05:17 +0000 Subject: tcp: fix TFO SYN_RECV to not zero retrans_stamp with retransmits out Fix tcp_rcv_synrecv_state_fastopen() to not zero retrans_stamp if retransmits are outstanding. tcp_fastopen_synack_timer() sets retrans_stamp, so typically we'll need to zero retrans_stamp here to prevent spurious retransmits_timed_out(). The logic to zero retrans_stamp is from this 2019 commit: commit cd736d8b67fb ("tcp: fix retrans timestamp on passive Fast Open") However, in the corner case where the ACK of our TFO SYNACK carried some SACK blocks that caused us to enter TCP_CA_Recovery then that non-zero retrans_stamp corresponds to the active fast recovery, and we need to leave retrans_stamp with its current non-zero value, for correct ETIMEDOUT and undo behavior. Fixes: cd736d8b67fb ("tcp: fix retrans timestamp on passive Fast Open") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241001200517.2756803-4-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8e47907c76d5..2d844e1f867f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6684,10 +6684,17 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out) tcp_try_undo_recovery(sk); - /* Reset rtx states to prevent spurious retransmits_timed_out() */ tcp_update_rto_time(tp); - tp->retrans_stamp = 0; inet_csk(sk)->icsk_retransmits = 0; + /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set + * retrans_stamp but don't enter CA_Loss, so in case that happened we + * need to zero retrans_stamp here to prevent spurious + * retransmits_timed_out(). However, if the ACK of our SYNACK caused us + * to enter CA_Recovery then we need to leave retrans_stamp as it was + * set entering CA_Recovery, for correct retransmits_timed_out() and + * undo behavior. + */ + tcp_retrans_stamp_cleanup(sk); /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. -- cgit v1.2.3 From bc212465326e8587325f520a052346f0b57360e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Oct 2024 14:26:58 +0100 Subject: rxrpc: Fix a race between socket set up and I/O thread creation In rxrpc_open_socket(), it sets up the socket and then sets up the I/O thread that will handle it. This is a problem, however, as there's a gap between the two phases in which a packet may come into rxrpc_encap_rcv() from the UDP packet but we oops when trying to wake the not-yet created I/O thread. As a quick fix, just make rxrpc_encap_rcv() discard the packet if there's no I/O thread yet. A better, but more intrusive fix would perhaps be to rearrange things such that the socket creation is done by the I/O thread. Fixes: a275da62e8c1 ("rxrpc: Create a per-local endpoint receive queue and I/O thread") Signed-off-by: David Howells cc: yuxuanzhe@outlook.com cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241001132702.3122709-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/io_thread.c | 10 ++++++++-- net/rxrpc/local_object.c | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 80d682f89b23..d0fd37bdcfe9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1056,7 +1056,7 @@ bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { - wake_up_process(local->io_thread); + wake_up_process(READ_ONCE(local->io_thread)); } static inline bool rxrpc_protocol_error(struct sk_buff *skb, enum rxrpc_abort_reason why) diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 0300baa9afcd..07c74c77d802 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -27,11 +27,17 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); + struct task_struct *io_thread; if (unlikely(!local)) { kfree_skb(skb); return 0; } + io_thread = READ_ONCE(local->io_thread); + if (!io_thread) { + kfree_skb(skb); + return 0; + } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); @@ -47,7 +53,7 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) #endif skb_queue_tail(rx_queue, skb); - rxrpc_wake_up_io_thread(local); + wake_up_process(io_thread); return 0; } @@ -565,7 +571,7 @@ int rxrpc_io_thread(void *data) __set_current_state(TASK_RUNNING); rxrpc_see_local(local, rxrpc_local_stop); rxrpc_destroy_local(local); - local->io_thread = NULL; + WRITE_ONCE(local->io_thread, NULL); rxrpc_see_local(local, rxrpc_local_stopped); return 0; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 504453c688d7..f9623ace2201 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -232,7 +232,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } wait_for_completion(&local->io_thread_ready); - local->io_thread = io_thread; + WRITE_ONCE(local->io_thread, io_thread); _leave(" = 0"); return 0; -- cgit v1.2.3 From 7a310f8d7dfe2d92a1f31ddb5357bfdd97eed273 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 1 Oct 2024 14:26:59 +0100 Subject: rxrpc: Fix uninitialised variable in rxrpc_send_data() Fix the uninitialised txb variable in rxrpc_send_data() by moving the code that loads it above all the jumps to maybe_error, txb being stored back into call->tx_pending right before the normal return. Fixes: b0f571ecd794 ("rxrpc: Fix locking in rxrpc's sendmsg") Reported-by: Dan Carpenter Closes: https://lists.infradead.org/pipermail/linux-afs/2024-October/008896.html Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241001132702.3122709-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/sendmsg.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 894b8fa68e5e..23d18fe5de9f 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -303,6 +303,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); reload: + txb = call->tx_pending; + call->tx_pending = NULL; + if (txb) + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); + ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) goto maybe_error; @@ -329,11 +334,6 @@ reload: goto maybe_error; } - txb = call->tx_pending; - call->tx_pending = NULL; - if (txb) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); - do { if (!txb) { size_t remain; -- cgit v1.2.3 From f9ff7665cd128012868098bbd07e28993e314fdb Mon Sep 17 00:00:00 2001 From: Andy Roulin Date: Tue, 1 Oct 2024 08:43:59 -0700 Subject: netfilter: br_netfilter: fix panic with metadata_dst skb Fix a kernel panic in the br_netfilter module when sending untagged traffic via a VxLAN device. This happens during the check for fragmentation in br_nf_dev_queue_xmit. It is dependent on: 1) the br_netfilter module being loaded; 2) net.bridge.bridge-nf-call-iptables set to 1; 3) a bridge with a VxLAN (single-vxlan-device) netdevice as a bridge port; 4) untagged frames with size higher than the VxLAN MTU forwarded/flooded When forwarding the untagged packet to the VxLAN bridge port, before the netfilter hooks are called, br_handle_egress_vlan_tunnel is called and changes the skb_dst to the tunnel dst. The tunnel_dst is a metadata type of dst, i.e., skb_valid_dst(skb) is false, and metadata->dst.dev is NULL. Then in the br_netfilter hooks, in br_nf_dev_queue_xmit, there's a check for frames that needs to be fragmented: frames with higher MTU than the VxLAN device end up calling br_nf_ip_fragment, which in turns call ip_skb_dst_mtu. The ip_dst_mtu tries to use the skb_dst(skb) as if it was a valid dst with valid dst->dev, thus the crash. This case was never supported in the first place, so drop the packet instead. PING 10.0.0.2 (10.0.0.2) from 0.0.0.0 h1-eth0: 2000(2028) bytes of data. [ 176.291791] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000110 [ 176.292101] Mem abort info: [ 176.292184] ESR = 0x0000000096000004 [ 176.292322] EC = 0x25: DABT (current EL), IL = 32 bits [ 176.292530] SET = 0, FnV = 0 [ 176.292709] EA = 0, S1PTW = 0 [ 176.292862] FSC = 0x04: level 0 translation fault [ 176.293013] Data abort info: [ 176.293104] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 176.293488] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 176.293787] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 176.293995] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000043ef5000 [ 176.294166] [0000000000000110] pgd=0000000000000000, p4d=0000000000000000 [ 176.294827] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 176.295252] Modules linked in: vxlan ip6_udp_tunnel udp_tunnel veth br_netfilter bridge stp llc ipv6 crct10dif_ce [ 176.295923] CPU: 0 PID: 188 Comm: ping Not tainted 6.8.0-rc3-g5b3fbd61b9d1 #2 [ 176.296314] Hardware name: linux,dummy-virt (DT) [ 176.296535] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 176.296808] pc : br_nf_dev_queue_xmit+0x390/0x4ec [br_netfilter] [ 176.297382] lr : br_nf_dev_queue_xmit+0x2ac/0x4ec [br_netfilter] [ 176.297636] sp : ffff800080003630 [ 176.297743] x29: ffff800080003630 x28: 0000000000000008 x27: ffff6828c49ad9f8 [ 176.298093] x26: ffff6828c49ad000 x25: 0000000000000000 x24: 00000000000003e8 [ 176.298430] x23: 0000000000000000 x22: ffff6828c4960b40 x21: ffff6828c3b16d28 [ 176.298652] x20: ffff6828c3167048 x19: ffff6828c3b16d00 x18: 0000000000000014 [ 176.298926] x17: ffffb0476322f000 x16: ffffb7e164023730 x15: 0000000095744632 [ 176.299296] x14: ffff6828c3f1c880 x13: 0000000000000002 x12: ffffb7e137926a70 [ 176.299574] x11: 0000000000000001 x10: ffff6828c3f1c898 x9 : 0000000000000000 [ 176.300049] x8 : ffff6828c49bf070 x7 : 0008460f18d5f20e x6 : f20e0100bebafeca [ 176.300302] x5 : ffff6828c7f918fe x4 : ffff6828c49bf070 x3 : 0000000000000000 [ 176.300586] x2 : 0000000000000000 x1 : ffff6828c3c7ad00 x0 : ffff6828c7f918f0 [ 176.300889] Call trace: [ 176.301123] br_nf_dev_queue_xmit+0x390/0x4ec [br_netfilter] [ 176.301411] br_nf_post_routing+0x2a8/0x3e4 [br_netfilter] [ 176.301703] nf_hook_slow+0x48/0x124 [ 176.302060] br_forward_finish+0xc8/0xe8 [bridge] [ 176.302371] br_nf_hook_thresh+0x124/0x134 [br_netfilter] [ 176.302605] br_nf_forward_finish+0x118/0x22c [br_netfilter] [ 176.302824] br_nf_forward_ip.part.0+0x264/0x290 [br_netfilter] [ 176.303136] br_nf_forward+0x2b8/0x4e0 [br_netfilter] [ 176.303359] nf_hook_slow+0x48/0x124 [ 176.303803] __br_forward+0xc4/0x194 [bridge] [ 176.304013] br_flood+0xd4/0x168 [bridge] [ 176.304300] br_handle_frame_finish+0x1d4/0x5c4 [bridge] [ 176.304536] br_nf_hook_thresh+0x124/0x134 [br_netfilter] [ 176.304978] br_nf_pre_routing_finish+0x29c/0x494 [br_netfilter] [ 176.305188] br_nf_pre_routing+0x250/0x524 [br_netfilter] [ 176.305428] br_handle_frame+0x244/0x3cc [bridge] [ 176.305695] __netif_receive_skb_core.constprop.0+0x33c/0xecc [ 176.306080] __netif_receive_skb_one_core+0x40/0x8c [ 176.306197] __netif_receive_skb+0x18/0x64 [ 176.306369] process_backlog+0x80/0x124 [ 176.306540] __napi_poll+0x38/0x17c [ 176.306636] net_rx_action+0x124/0x26c [ 176.306758] __do_softirq+0x100/0x26c [ 176.307051] ____do_softirq+0x10/0x1c [ 176.307162] call_on_irq_stack+0x24/0x4c [ 176.307289] do_softirq_own_stack+0x1c/0x2c [ 176.307396] do_softirq+0x54/0x6c [ 176.307485] __local_bh_enable_ip+0x8c/0x98 [ 176.307637] __dev_queue_xmit+0x22c/0xd28 [ 176.307775] neigh_resolve_output+0xf4/0x1a0 [ 176.308018] ip_finish_output2+0x1c8/0x628 [ 176.308137] ip_do_fragment+0x5b4/0x658 [ 176.308279] ip_fragment.constprop.0+0x48/0xec [ 176.308420] __ip_finish_output+0xa4/0x254 [ 176.308593] ip_finish_output+0x34/0x130 [ 176.308814] ip_output+0x6c/0x108 [ 176.308929] ip_send_skb+0x50/0xf0 [ 176.309095] ip_push_pending_frames+0x30/0x54 [ 176.309254] raw_sendmsg+0x758/0xaec [ 176.309568] inet_sendmsg+0x44/0x70 [ 176.309667] __sys_sendto+0x110/0x178 [ 176.309758] __arm64_sys_sendto+0x28/0x38 [ 176.309918] invoke_syscall+0x48/0x110 [ 176.310211] el0_svc_common.constprop.0+0x40/0xe0 [ 176.310353] do_el0_svc+0x1c/0x28 [ 176.310434] el0_svc+0x34/0xb4 [ 176.310551] el0t_64_sync_handler+0x120/0x12c [ 176.310690] el0t_64_sync+0x190/0x194 [ 176.311066] Code: f9402e61 79402aa2 927ff821 f9400023 (f9408860) [ 176.315743] ---[ end trace 0000000000000000 ]--- [ 176.316060] Kernel panic - not syncing: Oops: Fatal exception in interrupt [ 176.316371] Kernel Offset: 0x37e0e3000000 from 0xffff800080000000 [ 176.316564] PHYS_OFFSET: 0xffff97d780000000 [ 176.316782] CPU features: 0x0,88000203,3c020000,0100421b [ 176.317210] Memory Limit: none [ 176.317527] ---[ end Kernel panic - not syncing: Oops: Fatal Exception in interrupt ]---\ Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Reviewed-by: Ido Schimmel Signed-off-by: Andy Roulin Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241001154400.22787-2-aroulin@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_netfilter_hooks.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0e8bc0ea6175..1d458e9da660 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -879,6 +880,10 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff return br_dev_queue_push_xmit(net, sk, skb); } + /* Fragmentation on metadata/template dst is not supported */ + if (unlikely(!skb_valid_dst(skb))) + goto drop; + /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ -- cgit v1.2.3 From 08d1914293dae38350b8088980e59fbc699a72fe Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 30 Sep 2024 13:26:21 -0400 Subject: Bluetooth: RFCOMM: FIX possible deadlock in rfcomm_sk_state_change rfcomm_sk_state_change attempts to use sock_lock so it must never be called with it locked but rfcomm_sock_ioctl always attempt to lock it causing the following trace: ====================================================== WARNING: possible circular locking dependency detected 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Not tainted ------------------------------------------------------ syz-executor386/5093 is trying to acquire lock: ffff88807c396258 (sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1671 [inline] ffff88807c396258 (sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM){+.+.}-{0:0}, at: rfcomm_sk_state_change+0x5b/0x310 net/bluetooth/rfcomm/sock.c:73 but task is already holding lock: ffff88807badfd28 (&d->lock){+.+.}-{3:3}, at: __rfcomm_dlc_close+0x226/0x6a0 net/bluetooth/rfcomm/core.c:491 Reported-by: syzbot+d7ce59b06b3eb14fd218@syzkaller.appspotmail.com Tested-by: syzbot+d7ce59b06b3eb14fd218@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d7ce59b06b3eb14fd218 Fixes: 3241ad820dbb ("[Bluetooth] Add timestamp support to L2CAP, RFCOMM and SCO") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/sock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 37d63d768afb..f48250e3f2e1 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -865,9 +865,7 @@ static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned lon if (err == -ENOIOCTLCMD) { #ifdef CONFIG_BT_RFCOMM_TTY - lock_sock(sk); err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg); - release_sock(sk); #else err = -EOPNOTSUPP; #endif -- cgit v1.2.3 From 18fd04ad856df07733f5bb07e7f7168e7443d393 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 2 Oct 2024 11:17:26 -0400 Subject: Bluetooth: hci_conn: Fix UAF in hci_enhanced_setup_sync This checks if the ACL connection remains valid as it could be destroyed while hci_enhanced_setup_sync is pending on cmd_sync leading to the following trace: BUG: KASAN: slab-use-after-free in hci_enhanced_setup_sync+0x91b/0xa60 Read of size 1 at addr ffff888002328ffd by task kworker/u5:2/37 CPU: 0 UID: 0 PID: 37 Comm: kworker/u5:2 Not tainted 6.11.0-rc6-01300-g810be445d8d6 #7099 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x5d/0x80 ? hci_enhanced_setup_sync+0x91b/0xa60 print_report+0x152/0x4c0 ? hci_enhanced_setup_sync+0x91b/0xa60 ? __virt_addr_valid+0x1fa/0x420 ? hci_enhanced_setup_sync+0x91b/0xa60 kasan_report+0xda/0x1b0 ? hci_enhanced_setup_sync+0x91b/0xa60 hci_enhanced_setup_sync+0x91b/0xa60 ? __pfx_hci_enhanced_setup_sync+0x10/0x10 ? __pfx___mutex_lock+0x10/0x10 hci_cmd_sync_work+0x1c2/0x330 process_one_work+0x7d9/0x1360 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_process_one_work+0x10/0x10 ? assign_work+0x167/0x240 worker_thread+0x5b7/0xf60 ? __kthread_parkme+0xac/0x1c0 ? __pfx_worker_thread+0x10/0x10 ? __pfx_worker_thread+0x10/0x10 kthread+0x293/0x360 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Allocated by task 34: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 __kasan_kmalloc+0x8f/0xa0 __hci_conn_add+0x187/0x17d0 hci_connect_sco+0x2e1/0xb90 sco_sock_connect+0x2a2/0xb80 __sys_connect+0x227/0x2a0 __x64_sys_connect+0x6d/0xb0 do_syscall_64+0x71/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 37: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x101/0x160 kfree+0xd0/0x250 device_release+0x9a/0x210 kobject_put+0x151/0x280 hci_conn_del+0x448/0xbf0 hci_abort_conn_sync+0x46f/0x980 hci_cmd_sync_work+0x1c2/0x330 process_one_work+0x7d9/0x1360 worker_thread+0x5b7/0xf60 kthread+0x293/0x360 ret_from_fork+0x2f/0x70 ret_from_fork_asm+0x1a/0x30 Cc: stable@vger.kernel.org Fixes: e07a06b4eb41 ("Bluetooth: Convert SCO configure_datapath to hci_sync") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index d083117ee36c..c4c74b82ed21 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -289,6 +289,9 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) kfree(conn_handle); + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + bt_dev_dbg(hdev, "hcon %p", conn); configure_datapath_sync(hdev, &conn->codec); -- cgit v1.2.3 From 1dae9f1187189bc09ff6d25ca97ead711f7e26f9 Mon Sep 17 00:00:00 2001 From: Anastasia Kovaleva Date: Thu, 3 Oct 2024 13:44:31 +0300 Subject: net: Fix an unsafe loop on the list The kernel may crash when deleting a genetlink family if there are still listeners for that family: Oops: Kernel access of bad area, sig: 11 [#1] ... NIP [c000000000c080bc] netlink_update_socket_mc+0x3c/0xc0 LR [c000000000c0f764] __netlink_clear_multicast_users+0x74/0xc0 Call Trace: __netlink_clear_multicast_users+0x74/0xc0 genl_unregister_family+0xd4/0x2d0 Change the unsafe loop on the list to a safe one, because inside the loop there is an element removal from this list. Fixes: b8273570f802 ("genetlink: fix netns vs. netlink table locking (2)") Cc: stable@vger.kernel.org Signed-off-by: Anastasia Kovaleva Reviewed-by: Dmitry Bogdanov Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241003104431.12391-1-a.kovaleva@yadro.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 ++ net/netlink/af_netlink.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index c58ca8dd561b..db29c39e19a7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -894,6 +894,8 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) +#define sk_for_each_bound_safe(__sk, tmp, list) \ + hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0b7a89db3ab7..0a9287fadb47 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2136,8 +2136,9 @@ void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; + struct hlist_node *tmp; - sk_for_each_bound(sk, &tbl->mc_list) + sk_for_each_bound_safe(sk, tmp, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } -- cgit v1.2.3 From a194c985973276b2f280428c848f20369bb83734 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 2 Oct 2024 09:35:20 -0400 Subject: vsock/virtio: use GFP_ATOMIC under RCU read lock virtio_transport_send_pkt in now called on transport fast path, under RCU read lock. In that case, we have a bug: virtio_add_sgs is called with GFP_KERNEL, and might sleep. Pass the gfp flags as an argument, and use GFP_ATOMIC on the fast path. Link: https://lore.kernel.org/all/hfcr2aget2zojmqpr4uhlzvnep4vgskblx5b6xf2ddosbsrke7@nt34bxgp7j2x Fixes: efcd71af38be ("vsock/virtio: avoid queuing packets when intermediate queue is empty") Reported-by: Christian Brauner Cc: Stefano Garzarella Cc: Luigi Leonardi Message-ID: <3fbfb6e871f625f89eb578c7228e127437b1975a.1727876449.git.mst@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Pankaj Gupta Reviewed-by: Christian Brauner Reviewed-by: Luigi Leonardi Reviewed-by: Stefano Garzarella --- net/vmw_vsock/virtio_transport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index f992f9a216f0..0cd965f24609 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -96,7 +96,7 @@ out_rcu: /* Caller need to hold vsock->tx_lock on vq */ static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq, - struct virtio_vsock *vsock) + struct virtio_vsock *vsock, gfp_t gfp) { int ret, in_sg = 0, out_sg = 0; struct scatterlist **sgs; @@ -140,7 +140,7 @@ static int virtio_transport_send_skb(struct sk_buff *skb, struct virtqueue *vq, } } - ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); + ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, gfp); /* Usually this means that there is no more space available in * the vq */ @@ -178,7 +178,7 @@ virtio_transport_send_pkt_work(struct work_struct *work) reply = virtio_vsock_skb_reply(skb); - ret = virtio_transport_send_skb(skb, vq, vsock); + ret = virtio_transport_send_skb(skb, vq, vsock, GFP_KERNEL); if (ret < 0) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; @@ -221,7 +221,7 @@ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struc if (unlikely(ret == 0)) return -EBUSY; - ret = virtio_transport_send_skb(skb, vq, vsock); + ret = virtio_transport_send_skb(skb, vq, vsock, GFP_ATOMIC); if (ret == 0) virtqueue_kick(vq); -- cgit v1.2.3 From 631083143315d1b192bd7d915b967b37819e88ea Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Thu, 3 Oct 2024 18:01:51 +0100 Subject: net: explicitly clear the sk pointer, when pf->create fails We have recently noticed the exact same KASAN splat as in commit 6cd4a78d962b ("net: do not leave a dangling sk pointer, when socket creation fails"). The problem is that commit did not fully address the problem, as some pf->create implementations do not use sk_common_release in their error paths. For example, we can use the same reproducer as in the above commit, but changing ping to arping. arping uses AF_PACKET socket and if packet_create fails, it will just sk_free the allocated sk object. While we could chase all the pf->create implementations and make sure they NULL the freed sk object on error from the socket, we can't guarantee future protocols will not make the same mistake. So it is easier to just explicitly NULL the sk pointer upon return from pf->create in __sock_create. We do know that pf->create always releases the allocated sk object on error, so if the pointer is not NULL, it is definitely dangling. Fixes: 6cd4a78d962b ("net: do not leave a dangling sk pointer, when socket creation fails") Signed-off-by: Ignat Korchagin Cc: stable@vger.kernel.org Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241003170151.69445-1-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/socket.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 601ad74930ef..042451f01c65 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1574,8 +1574,13 @@ int __sock_create(struct net *net, int family, int type, int protocol, rcu_read_unlock(); err = pf->create(net, sock, protocol, kern); - if (err < 0) + if (err < 0) { + /* ->create should release the allocated sock->sk object on error + * but it may leave the dangling pointer + */ + sock->sk = NULL; goto out_module_put; + } /* * Now to bump the refcnt of the [loadable] module that owns this -- cgit v1.2.3 From 3cb7cf1540ddff5473d6baeb530228d19bc97b8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Oct 2024 18:41:30 +0000 Subject: net/sched: accept TCA_STAB only for root qdisc Most qdiscs maintain their backlog using qdisc_pkt_len(skb) on the assumption it is invariant between the enqueue() and dequeue() handlers. Unfortunately syzbot can crash a host rather easily using a TBF + SFQ combination, with an STAB on SFQ [1] We can't support TCA_STAB on arbitrary level, this would require to maintain per-qdisc storage. [1] [ 88.796496] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 88.798611] #PF: supervisor read access in kernel mode [ 88.799014] #PF: error_code(0x0000) - not-present page [ 88.799506] PGD 0 P4D 0 [ 88.799829] Oops: Oops: 0000 [#1] SMP NOPTI [ 88.800569] CPU: 14 UID: 0 PID: 2053 Comm: b371744477 Not tainted 6.12.0-rc1-virtme #1117 [ 88.801107] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 88.801779] RIP: 0010:sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq [ 88.802544] Code: 0f b7 50 12 48 8d 04 d5 00 00 00 00 48 89 d6 48 29 d0 48 8b 91 c0 01 00 00 48 c1 e0 03 48 01 c2 66 83 7a 1a 00 7e c0 48 8b 3a <4c> 8b 07 4c 89 02 49 89 50 08 48 c7 47 08 00 00 00 00 48 c7 07 00 All code ======== 0: 0f b7 50 12 movzwl 0x12(%rax),%edx 4: 48 8d 04 d5 00 00 00 lea 0x0(,%rdx,8),%rax b: 00 c: 48 89 d6 mov %rdx,%rsi f: 48 29 d0 sub %rdx,%rax 12: 48 8b 91 c0 01 00 00 mov 0x1c0(%rcx),%rdx 19: 48 c1 e0 03 shl $0x3,%rax 1d: 48 01 c2 add %rax,%rdx 20: 66 83 7a 1a 00 cmpw $0x0,0x1a(%rdx) 25: 7e c0 jle 0xffffffffffffffe7 27: 48 8b 3a mov (%rdx),%rdi 2a:* 4c 8b 07 mov (%rdi),%r8 <-- trapping instruction 2d: 4c 89 02 mov %r8,(%rdx) 30: 49 89 50 08 mov %rdx,0x8(%r8) 34: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) 3b: 00 3c: 48 rex.W 3d: c7 .byte 0xc7 3e: 07 (bad) ... Code starting with the faulting instruction =========================================== 0: 4c 8b 07 mov (%rdi),%r8 3: 4c 89 02 mov %r8,(%rdx) 6: 49 89 50 08 mov %rdx,0x8(%r8) a: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) 11: 00 12: 48 rex.W 13: c7 .byte 0xc7 14: 07 (bad) ... [ 88.803721] RSP: 0018:ffff9a1f892b7d58 EFLAGS: 00000206 [ 88.804032] RAX: 0000000000000000 RBX: ffff9a1f8420c800 RCX: ffff9a1f8420c800 [ 88.804560] RDX: ffff9a1f81bc1440 RSI: 0000000000000000 RDI: 0000000000000000 [ 88.805056] RBP: ffffffffc04bb0e0 R08: 0000000000000001 R09: 00000000ff7f9a1f [ 88.805473] R10: 000000000001001b R11: 0000000000009a1f R12: 0000000000000140 [ 88.806194] R13: 0000000000000001 R14: ffff9a1f886df400 R15: ffff9a1f886df4ac [ 88.806734] FS: 00007f445601a740(0000) GS:ffff9a2e7fd80000(0000) knlGS:0000000000000000 [ 88.807225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 88.807672] CR2: 0000000000000000 CR3: 000000050cc46000 CR4: 00000000000006f0 [ 88.808165] Call Trace: [ 88.808459] [ 88.808710] ? __die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434) [ 88.809261] ? page_fault_oops (arch/x86/mm/fault.c:715) [ 88.809561] ? exc_page_fault (./arch/x86/include/asm/irqflags.h:26 ./arch/x86/include/asm/irqflags.h:87 ./arch/x86/include/asm/irqflags.h:147 arch/x86/mm/fault.c:1489 arch/x86/mm/fault.c:1539) [ 88.809806] ? asm_exc_page_fault (./arch/x86/include/asm/idtentry.h:623) [ 88.810074] ? sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq [ 88.810411] sfq_reset (net/sched/sch_sfq.c:525) sch_sfq [ 88.810671] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) [ 88.810950] tbf_reset (./include/linux/timekeeping.h:169 net/sched/sch_tbf.c:334) sch_tbf [ 88.811208] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) [ 88.811484] netif_set_real_num_tx_queues (./include/linux/spinlock.h:396 ./include/net/sch_generic.h:768 net/core/dev.c:2958) [ 88.811870] __tun_detach (drivers/net/tun.c:590 drivers/net/tun.c:673) [ 88.812271] tun_chr_close (drivers/net/tun.c:702 drivers/net/tun.c:3517) [ 88.812505] __fput (fs/file_table.c:432 (discriminator 1)) [ 88.812735] task_work_run (kernel/task_work.c:230) [ 88.813016] do_exit (kernel/exit.c:940) [ 88.813372] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:58 (discriminator 4)) [ 88.813639] ? handle_mm_fault (./arch/x86/include/asm/irqflags.h:42 ./arch/x86/include/asm/irqflags.h:97 ./arch/x86/include/asm/irqflags.h:155 ./include/linux/memcontrol.h:1022 ./include/linux/memcontrol.h:1045 ./include/linux/memcontrol.h:1052 mm/memory.c:5928 mm/memory.c:6088) [ 88.813867] do_group_exit (kernel/exit.c:1070) [ 88.814138] __x64_sys_exit_group (kernel/exit.c:1099) [ 88.814490] x64_sys_call (??:?) [ 88.814791] do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) [ 88.815012] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) [ 88.815495] RIP: 0033:0x7f44560f1975 Fixes: 175f9c1bba9b ("net_sched: Add size table for qdiscs") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Link: https://patch.msgid.link/20241007184130.3960565-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 - net/sched/sch_api.c | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 79edd5b5e3c9..5d74fa7e694c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -848,7 +848,6 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch, to_free); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 74afc210527d..2eefa4783879 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -593,7 +593,6 @@ out: pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { @@ -1201,6 +1200,12 @@ skip: return -EINVAL; } + if (new && + !(parent->flags & TCQ_F_MQROOT) && + rcu_access_pointer(new->stab)) { + NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); + return -EINVAL; + } err = cops->graft(parent, cl, new, &old, extack); if (err) return err; -- cgit v1.2.3 From faa34159d08089036b6119c85e279fb36abb8bb5 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 30 Sep 2024 16:15:20 +0800 Subject: net/9p/usbg: Fix build error When CONFIG_NET_9P_USBG=y but CONFIG_USB_LIBCOMPOSITE=m and CONFIG_CONFIGFS_FS=m, the following build error occurs: riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_free_func': trans_usbg.c:(.text+0x124): undefined reference to `usb_free_all_descriptors' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_rx_complete': trans_usbg.c:(.text+0x2d8): undefined reference to `usb_interface_id' riscv64-unknown-linux-gnu-ld: trans_usbg.c:(.text+0x2f6): undefined reference to `usb_string_id' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_func_bind': trans_usbg.c:(.text+0x31c): undefined reference to `usb_ep_autoconfig' riscv64-unknown-linux-gnu-ld: trans_usbg.c:(.text+0x336): undefined reference to `usb_ep_autoconfig' riscv64-unknown-linux-gnu-ld: trans_usbg.c:(.text+0x378): undefined reference to `usb_assign_descriptors' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `f_usb9pfs_opts_buflen_store': trans_usbg.c:(.text+0x49e): undefined reference to `usb_put_function_instance' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_alloc_instance': trans_usbg.c:(.text+0x5fe): undefined reference to `config_group_init_type_name' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_alloc': trans_usbg.c:(.text+0x7aa): undefined reference to `config_ep_by_speed' riscv64-unknown-linux-gnu-ld: trans_usbg.c:(.text+0x7ea): undefined reference to `config_ep_by_speed' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_set_alt': trans_usbg.c:(.text+0x828): undefined reference to `alloc_ep_req' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_modexit': trans_usbg.c:(.exit.text+0x10): undefined reference to `usb_function_unregister' riscv64-unknown-linux-gnu-ld: net/9p/trans_usbg.o: in function `usb9pfs_modinit': trans_usbg.c:(.init.text+0x1e): undefined reference to `usb_function_register' Select the config for NET_9P_USBG to fix it. Fixes: a3be076dc174 ("net/9p/usbg: Add new usb gadget function transport") Signed-off-by: Jinjie Ruan Tested-by: Kexy Biscuit Link: https://lore.kernel.org/r/20240930081520.2371424-1-ruanjinjie@huawei.com Signed-off-by: Greg Kroah-Hartman --- net/9p/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/9p/Kconfig b/net/9p/Kconfig index 63f988f0c9e8..ee967fd25312 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -43,6 +43,8 @@ config NET_9P_XEN config NET_9P_USBG bool "9P USB Gadget Transport" depends on USB_GADGET=y || USB_GADGET=NET_9P + select CONFIGFS_FS + select USB_LIBCOMPOSITE help This builds support for a transport for 9pfs over usb gadget. -- cgit v1.2.3 From 4d5c70e6155d5eae198bade4afeab3c1b15073b6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 7 Oct 2024 12:25:11 -0400 Subject: sctp: ensure sk_state is set to CLOSED if hashing fails in sctp_listen_start If hashing fails in sctp_listen_start(), the socket remains in the LISTENING state, even though it was not added to the hash table. This can lead to a scenario where a socket appears to be listening without actually being accessible. This patch ensures that if the hashing operation fails, the sk_state is set back to CLOSED before returning an error. Note that there is no need to undo the autobind operation if hashing fails, as the bind port can still be used for next listen() call on the same socket. Fixes: 76c6d988aeb3 ("sctp: add sock_reuseport for the sock in __sctp_hash_endpoint") Reported-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 078bcb3858c7..36ee34f483d7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8531,6 +8531,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) struct sctp_endpoint *ep = sp->ep; struct crypto_shash *tfm = NULL; char alg[32]; + int err; /* Allocate HMAC for generating cookie. */ if (!sp->hmac && sp->sctp_hmac_alg) { @@ -8558,18 +8559,25 @@ static int sctp_listen_start(struct sock *sk, int backlog) inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) { - inet_sk_set_state(sk, SCTP_SS_CLOSED); - return -EAGAIN; + err = -EAGAIN; + goto err; } } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { - inet_sk_set_state(sk, SCTP_SS_CLOSED); - return -EADDRINUSE; + err = -EADDRINUSE; + goto err; } } WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - return sctp_hash_endpoint(ep); + err = sctp_hash_endpoint(ep); + if (err) + goto err; + + return 0; +err: + inet_sk_set_state(sk, SCTP_SS_CLOSED); + return err; } /* -- cgit v1.2.3 From 0bfcb7b71e735560077a42847f69597ec7dcc326 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 7 Oct 2024 11:28:16 +0200 Subject: netfilter: xtables: avoid NFPROTO_UNSPEC where needed syzbot managed to call xt_cluster match via ebtables: WARNING: CPU: 0 PID: 11 at net/netfilter/xt_cluster.c:72 xt_cluster_mt+0x196/0x780 [..] ebt_do_table+0x174b/0x2a40 Module registers to NFPROTO_UNSPEC, but it assumes ipv4/ipv6 packet processing. As this is only useful to restrict locally terminating TCP/UDP traffic, register this for ipv4 and ipv6 family only. Pablo points out that this is a general issue, direct users of the set/getsockopt interface can call into targets/matches that were only intended for use with ip(6)tables. Check all UNSPEC matches and targets for similar issues: - matches and targets are fine except if they assume skb_network_header() is valid -- this is only true when called from inet layer: ip(6) stack pulls the ip/ipv6 header into linear data area. - targets that return XT_CONTINUE or other xtables verdicts must be restricted too, they are incompatbile with the ebtables traverser, e.g. EBT_CONTINUE is a completely different value than XT_CONTINUE. Most matches/targets are changed to register for NFPROTO_IPV4/IPV6, as they are provided for use by ip(6)tables. The MARK target is also used by arptables, so register for NFPROTO_ARP too. While at it, bail out if connbytes fails to enable the corresponding conntrack family. This change passes the selftests in iptables.git. Reported-by: syzbot+256c348558aa5cf611a9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netfilter-devel/66fec2e2.050a0220.9ec68.0047.GAE@google.com/ Fixes: 0269ea493734 ("netfilter: xtables: add cluster match") Signed-off-by: Florian Westphal Co-developed-by: Pablo Neira Ayuso Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CHECKSUM.c | 33 +++++++++---- net/netfilter/xt_CLASSIFY.c | 16 ++++++- net/netfilter/xt_CONNSECMARK.c | 36 +++++++++----- net/netfilter/xt_CT.c | 106 +++++++++++++++++++++++++++-------------- net/netfilter/xt_IDLETIMER.c | 59 +++++++++++++++-------- net/netfilter/xt_LED.c | 39 ++++++++++----- net/netfilter/xt_NFLOG.c | 36 +++++++++----- net/netfilter/xt_RATEEST.c | 39 ++++++++++----- net/netfilter/xt_SECMARK.c | 27 ++++++++++- net/netfilter/xt_TRACE.c | 35 +++++++++----- net/netfilter/xt_addrtype.c | 15 +++++- net/netfilter/xt_cluster.c | 33 +++++++++---- net/netfilter/xt_connbytes.c | 4 +- net/netfilter/xt_connlimit.c | 39 ++++++++++----- net/netfilter/xt_connmark.c | 28 +++++++++-- net/netfilter/xt_mark.c | 42 ++++++++++++---- 16 files changed, 422 insertions(+), 165 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index c8a639f56168..9d99f5a3d176 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -63,24 +63,37 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) return 0; } -static struct xt_target checksum_tg_reg __read_mostly = { - .name = "CHECKSUM", - .family = NFPROTO_UNSPEC, - .target = checksum_tg, - .targetsize = sizeof(struct xt_CHECKSUM_info), - .table = "mangle", - .checkentry = checksum_tg_check, - .me = THIS_MODULE, +static struct xt_target checksum_tg_reg[] __read_mostly = { + { + .name = "CHECKSUM", + .family = NFPROTO_IPV4, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CHECKSUM", + .family = NFPROTO_IPV6, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, + }, +#endif }; static int __init checksum_tg_init(void) { - return xt_register_target(&checksum_tg_reg); + return xt_register_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } static void __exit checksum_tg_exit(void) { - xt_unregister_target(&checksum_tg_reg); + xt_unregister_targets(checksum_tg_reg, ARRAY_SIZE(checksum_tg_reg)); } module_init(checksum_tg_init); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 0accac98dea7..0ae8d8a1216e 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -38,9 +38,9 @@ static struct xt_target classify_tg_reg[] __read_mostly = { { .name = "CLASSIFY", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING), .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, @@ -54,6 +54,18 @@ static struct xt_target classify_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_classify_target_info), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_IPV6, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init classify_tg_init(void) diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 76acecf3e757..1494b3ee30e1 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -114,25 +114,39 @@ static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connsecmark_tg_reg __read_mostly = { - .name = "CONNSECMARK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = connsecmark_tg_check, - .destroy = connsecmark_tg_destroy, - .target = connsecmark_tg, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .me = THIS_MODULE, +static struct xt_target connsecmark_tg_reg[] __read_mostly = { + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, + }, +#endif }; static int __init connsecmark_tg_init(void) { - return xt_register_target(&connsecmark_tg_reg); + return xt_register_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } static void __exit connsecmark_tg_exit(void) { - xt_unregister_target(&connsecmark_tg_reg); + xt_unregister_targets(connsecmark_tg_reg, ARRAY_SIZE(connsecmark_tg_reg)); } module_init(connsecmark_tg_init); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 2be2f7a7b60f..3ba94c34297c 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -313,10 +313,30 @@ static void xt_ct_tg_destroy_v1(const struct xt_tgdtor_param *par) xt_ct_tg_destroy(par, par->targinfo); } +static unsigned int +notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + /* Previously seen (loopback)? Ignore. */ + if (skb->_nfct != 0) + return XT_CONTINUE; + + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + return XT_CONTINUE; +} + static struct xt_target xt_ct_tg_reg[] __read_mostly = { + { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV4, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .targetsize = sizeof(struct xt_ct_target_info), .usersize = offsetof(struct xt_ct_target_info, ct), .checkentry = xt_ct_tg_check_v0, @@ -327,7 +347,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -339,7 +359,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { }, { .name = "CT", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), .usersize = offsetof(struct xt_ct_target_info, ct), @@ -349,49 +369,61 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .table = "raw", .me = THIS_MODULE, }, -}; - -static unsigned int -notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) -{ - /* Previously seen (loopback)? Ignore. */ - if (skb->_nfct != 0) - return XT_CONTINUE; - - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); - - return XT_CONTINUE; -} - -static struct xt_target notrack_tg_reg __read_mostly = { - .name = "NOTRACK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_IPV6, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .targetsize = sizeof(struct xt_ct_target_info), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v0, + .destroy = xt_ct_tg_destroy_v0, + .target = xt_ct_target_v0, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 1, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v1, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, + { + .name = "CT", + .family = NFPROTO_IPV6, + .revision = 2, + .targetsize = sizeof(struct xt_ct_target_info_v1), + .usersize = offsetof(struct xt_ct_target_info, ct), + .checkentry = xt_ct_tg_check_v2, + .destroy = xt_ct_tg_destroy_v1, + .target = xt_ct_target_v1, + .table = "raw", + .me = THIS_MODULE, + }, +#endif }; static int __init xt_ct_tg_init(void) { - int ret; - - ret = xt_register_target(¬rack_tg_reg); - if (ret < 0) - return ret; - - ret = xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - if (ret < 0) { - xt_unregister_target(¬rack_tg_reg); - return ret; - } - return 0; + return xt_register_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); } static void __exit xt_ct_tg_exit(void) { xt_unregister_targets(xt_ct_tg_reg, ARRAY_SIZE(xt_ct_tg_reg)); - xt_unregister_target(¬rack_tg_reg); } module_init(xt_ct_tg_init); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index db720efa811d..f8b25b6f5da7 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -458,28 +458,49 @@ static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) static struct xt_target idletimer_tg[] __read_mostly = { { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .target = idletimer_tg_target, - .targetsize = sizeof(struct idletimer_tg_info), - .usersize = offsetof(struct idletimer_tg_info, timer), - .checkentry = idletimer_tg_checkentry, - .destroy = idletimer_tg_destroy, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, }, { - .name = "IDLETIMER", - .family = NFPROTO_UNSPEC, - .revision = 1, - .target = idletimer_tg_target_v1, - .targetsize = sizeof(struct idletimer_tg_info_v1), - .usersize = offsetof(struct idletimer_tg_info_v1, timer), - .checkentry = idletimer_tg_checkentry_v1, - .destroy = idletimer_tg_destroy_v1, - .me = THIS_MODULE, + .name = "IDLETIMER", + .family = NFPROTO_IPV4, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, }, - - +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .target = idletimer_tg_target, + .targetsize = sizeof(struct idletimer_tg_info), + .usersize = offsetof(struct idletimer_tg_info, timer), + .checkentry = idletimer_tg_checkentry, + .destroy = idletimer_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "IDLETIMER", + .family = NFPROTO_IPV6, + .revision = 1, + .target = idletimer_tg_target_v1, + .targetsize = sizeof(struct idletimer_tg_info_v1), + .usersize = offsetof(struct idletimer_tg_info_v1, timer), + .checkentry = idletimer_tg_checkentry_v1, + .destroy = idletimer_tg_destroy_v1, + .me = THIS_MODULE, + }, +#endif }; static struct class *idletimer_tg_class; diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index 36c9720ad8d6..f7b0286d106a 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -175,26 +175,41 @@ static void led_tg_destroy(const struct xt_tgdtor_param *par) kfree(ledinternal); } -static struct xt_target led_tg_reg __read_mostly = { - .name = "LED", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = led_tg, - .targetsize = sizeof(struct xt_led_info), - .usersize = offsetof(struct xt_led_info, internal_data), - .checkentry = led_tg_check, - .destroy = led_tg_destroy, - .me = THIS_MODULE, +static struct xt_target led_tg_reg[] __read_mostly = { + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV4, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "LED", + .revision = 0, + .family = NFPROTO_IPV6, + .target = led_tg, + .targetsize = sizeof(struct xt_led_info), + .usersize = offsetof(struct xt_led_info, internal_data), + .checkentry = led_tg_check, + .destroy = led_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init led_tg_init(void) { - return xt_register_target(&led_tg_reg); + return xt_register_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } static void __exit led_tg_exit(void) { - xt_unregister_target(&led_tg_reg); + xt_unregister_targets(led_tg_reg, ARRAY_SIZE(led_tg_reg)); } module_init(led_tg_init); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index e660c3710a10..d80abd6ccaf8 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -64,25 +64,39 @@ static void nflog_tg_destroy(const struct xt_tgdtor_param *par) nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } -static struct xt_target nflog_tg_reg __read_mostly = { - .name = "NFLOG", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = nflog_tg_check, - .destroy = nflog_tg_destroy, - .target = nflog_tg, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, +static struct xt_target nflog_tg_reg[] __read_mostly = { + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_IPV4, + .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, + }, +#endif }; static int __init nflog_tg_init(void) { - return xt_register_target(&nflog_tg_reg); + return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } static void __exit nflog_tg_exit(void) { - xt_unregister_target(&nflog_tg_reg); + xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } module_init(nflog_tg_init); diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 80f6624e2355..4f49cfc27831 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -179,16 +179,31 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) xt_rateest_put(par->net, info->est); } -static struct xt_target xt_rateest_tg_reg __read_mostly = { - .name = "RATEEST", - .revision = 0, - .family = NFPROTO_UNSPEC, - .target = xt_rateest_tg, - .checkentry = xt_rateest_tg_checkentry, - .destroy = xt_rateest_tg_destroy, - .targetsize = sizeof(struct xt_rateest_target_info), - .usersize = offsetof(struct xt_rateest_target_info, est), - .me = THIS_MODULE, +static struct xt_target xt_rateest_tg_reg[] __read_mostly = { + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV4, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_IPV6, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .usersize = offsetof(struct xt_rateest_target_info, est), + .me = THIS_MODULE, + }, +#endif }; static __net_init int xt_rateest_net_init(struct net *net) @@ -214,12 +229,12 @@ static int __init xt_rateest_tg_init(void) if (err) return err; - return xt_register_target(&xt_rateest_tg_reg); + return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { - xt_unregister_target(&xt_rateest_tg_reg); + xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 498a0bf6f044..5bc5ea505eb9 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -157,7 +157,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 0, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v0, .destroy = secmark_tg_destroy, .target = secmark_tg_v0, @@ -167,7 +167,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", .revision = 1, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check_v1, .destroy = secmark_tg_destroy, .target = secmark_tg_v1, @@ -175,6 +175,29 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { .usersize = offsetof(struct xt_secmark_target_info_v1, secid), .me = THIS_MODULE, }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v0, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v0, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, + }, + { + .name = "SECMARK", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = secmark_tg_check_v1, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v1, + .targetsize = sizeof(struct xt_secmark_target_info_v1), + .usersize = offsetof(struct xt_secmark_target_info_v1, secid), + .me = THIS_MODULE, + }, +#endif }; static int __init secmark_tg_init(void) diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 5582dce98cae..f3fa4f11348c 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -29,25 +29,38 @@ trace_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static struct xt_target trace_tg_reg __read_mostly = { - .name = "TRACE", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "raw", - .target = trace_tg, - .checkentry = trace_tg_check, - .destroy = trace_tg_destroy, - .me = THIS_MODULE, +static struct xt_target trace_tg_reg[] __read_mostly = { + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV4, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_IPV6, + .table = "raw", + .target = trace_tg, + .checkentry = trace_tg_check, + .destroy = trace_tg_destroy, + }, +#endif }; static int __init trace_tg_init(void) { - return xt_register_target(&trace_tg_reg); + return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } static void __exit trace_tg_exit(void) { - xt_unregister_target(&trace_tg_reg); + xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); } module_init(trace_tg_init); diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index e9b2181e8c42..a77088943107 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -208,13 +208,24 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { }, { .name = "addrtype", - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "addrtype", + .family = NFPROTO_IPV6, + .revision = 1, + .match = addrtype_mt_v1, + .checkentry = addrtype_mt_checkentry_v1, + .matchsize = sizeof(struct xt_addrtype_info_v1), + .me = THIS_MODULE + }, +#endif }; static int __init addrtype_mt_init(void) diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index a047a545371e..908fd5f2c3c8 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -146,24 +146,37 @@ static void xt_cluster_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_match xt_cluster_match __read_mostly = { - .name = "cluster", - .family = NFPROTO_UNSPEC, - .match = xt_cluster_mt, - .checkentry = xt_cluster_mt_checkentry, - .matchsize = sizeof(struct xt_cluster_match_info), - .destroy = xt_cluster_mt_destroy, - .me = THIS_MODULE, +static struct xt_match xt_cluster_match[] __read_mostly = { + { + .name = "cluster", + .family = NFPROTO_IPV4, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "cluster", + .family = NFPROTO_IPV6, + .match = xt_cluster_mt, + .checkentry = xt_cluster_mt_checkentry, + .matchsize = sizeof(struct xt_cluster_match_info), + .destroy = xt_cluster_mt_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init xt_cluster_mt_init(void) { - return xt_register_match(&xt_cluster_match); + return xt_register_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match)); } static void __exit xt_cluster_mt_fini(void) { - xt_unregister_match(&xt_cluster_match); + xt_unregister_matches(xt_cluster_match, ARRAY_SIZE(xt_cluster_match)); } MODULE_AUTHOR("Pablo Neira Ayuso "); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 93cb018c3055..2aabdcea8707 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -111,9 +111,11 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); - if (ret < 0) + if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); + return ret; + } /* * This filter cannot function correctly unless connection tracking diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 0e762277bcf8..0189f8b6b0bd 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -117,26 +117,41 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_match connlimit_mt_reg __read_mostly = { - .name = "connlimit", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .usersize = offsetof(struct xt_connlimit_info, data), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, +static struct xt_match connlimit_mt_reg[] __read_mostly = { + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_IPV4, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .usersize = offsetof(struct xt_connlimit_info, data), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, +#endif }; static int __init connlimit_mt_init(void) { - return xt_register_match(&connlimit_mt_reg); + return xt_register_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } static void __exit connlimit_mt_exit(void) { - xt_unregister_match(&connlimit_mt_reg); + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index ad3c033db64e..4277084de2e7 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -151,7 +151,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), @@ -161,13 +161,35 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 2, - .family = NFPROTO_UNSPEC, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg_v2, .targetsize = sizeof(struct xt_connmark_tginfo2), .destroy = connmark_tg_destroy, .me = THIS_MODULE, - } + }, +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_IPV6, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 2, + .family = NFPROTO_IPV6, + .checkentry = connmark_tg_check, + .target = connmark_tg_v2, + .targetsize = sizeof(struct xt_connmark_tginfo2), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, +#endif }; static struct xt_match connmark_mt_reg __read_mostly = { diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 1ad74b5920b5..f76fe04fc9a4 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -39,13 +39,35 @@ mark_mt(const struct sk_buff *skb, struct xt_action_param *par) return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static struct xt_target mark_tg_reg __read_mostly = { - .name = "MARK", - .revision = 2, - .family = NFPROTO_UNSPEC, - .target = mark_tg, - .targetsize = sizeof(struct xt_mark_tginfo2), - .me = THIS_MODULE, +static struct xt_target mark_tg_reg[] __read_mostly = { + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_IPV4, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_ARP, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#endif +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) + { + .name = "MARK", + .revision = 2, + .family = NFPROTO_IPV4, + .target = mark_tg, + .targetsize = sizeof(struct xt_mark_tginfo2), + .me = THIS_MODULE, + }, +#endif }; static struct xt_match mark_mt_reg __read_mostly = { @@ -61,12 +83,12 @@ static int __init mark_mt_init(void) { int ret; - ret = xt_register_target(&mark_tg_reg); + ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&mark_mt_reg); if (ret < 0) { - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); return ret; } return 0; @@ -75,7 +97,7 @@ static int __init mark_mt_init(void) static void __exit mark_mt_exit(void) { xt_unregister_match(&mark_mt_reg); - xt_unregister_target(&mark_tg_reg); + xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg)); } module_init(mark_mt_init); -- cgit v1.2.3 From 05ef7055debc804e8083737402127975e7244fc4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Oct 2024 09:19:02 +0200 Subject: netfilter: fib: check correct rtable in vrf setups We need to init l3mdev unconditionally, else main routing table is searched and incorrect result is returned unless strict (iif keyword) matching is requested. Next patch adds a selftest for this. Fixes: 2a8a7c0eaa87 ("netfilter: nft_fib: Fix for rpath check with VRF devices") Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1761 Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_fib_ipv4.c | 4 +--- net/ipv6/netfilter/nft_fib_ipv6.c | 5 +++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 00da1332bbf1..09fff5d424ef 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -65,6 +65,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), + .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; const struct net_device *oif; const struct net_device *found; @@ -83,9 +84,6 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (priv->flags & NFTA_FIB_F_IIF) - fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(oif); - if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { nft_fib_store_result(dest, priv, nft_in(pkt)); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 36dc14b34388..c9f1634b3838 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -41,8 +41,6 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); - } else if (priv->flags & NFTA_FIB_F_IIF) { - fl6->flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) @@ -75,6 +73,8 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); + fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); + nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -165,6 +165,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), + .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; struct rt6_info *rt; int lookup_flags; -- cgit v1.2.3 From 8c924369cb56c3054dca504c2c9c3eb208272865 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 8 Oct 2024 12:43:20 +0300 Subject: net: dsa: refuse cross-chip mirroring operations In case of a tc mirred action from one switch to another, the behavior is not correct. We simply tell the source switch driver to program a mirroring entry towards mirror->to_local_port = to_dp->index, but it is not even guaranteed that the to_dp belongs to the same switch as dp. For proper cross-chip support, we would need to go through the cross-chip notifier layer in switch.c, program the entry on cascade ports, and introduce new, explicit API for cross-chip mirroring, given that intermediary switches should have introspection into the DSA tags passed through the cascade port (and not just program a port mirror on the entire cascade port). None of that exists today. Reject what is not implemented so that user space is not misled into thinking it works. Fixes: f50f212749e8 ("net: dsa: Add plumbing for port mirroring") Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241008094320.3340980-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/user.c b/net/dsa/user.c index 74eda9b30608..64f660d2334b 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1392,6 +1392,14 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, if (!dsa_user_dev_check(act->dev)) return -EOPNOTSUPP; + to_dp = dsa_user_to_port(act->dev); + + if (dp->ds != to_dp->ds) { + NL_SET_ERR_MSG_MOD(extack, + "Cross-chip mirroring not implemented"); + return -EOPNOTSUPP; + } + mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; @@ -1399,9 +1407,6 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; - - to_dp = dsa_user_to_port(act->dev); - mirror->to_local_port = to_dp->index; mirror->ingress = ingress; -- cgit v1.2.3 From e32d262c89e2b22cb0640223f953b548617ed8a6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Oct 2024 13:04:52 +0200 Subject: mptcp: handle consistently DSS corruption Bugged peer implementation can send corrupted DSS options, consistently hitting a few warning in the data path. Use DEBUG_NET assertions, to avoid the splat on some builds and handle consistently the error, dumping related MIBs and performing fallback and/or reset according to the subflow type. Fixes: 6771bfd9ee24 ("mptcp: update mptcp ack sequence from work queue") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241008-net-mptcp-fallback-fixes-v1-1-c6fb8e93e551@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 2 ++ net/mptcp/mib.h | 2 ++ net/mptcp/protocol.c | 24 +++++++++++++++++++++--- net/mptcp/subflow.c | 4 +++- 4 files changed, 28 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 38c2efc82b94..ad88bd3c58df 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -32,6 +32,8 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK), + SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET), SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index c8ffe18a8722..3206cdda8bb1 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -27,6 +27,8 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCONNECTERR, /* Not able to connect() when sending a SYN + MP_JOIN */ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_DSSCORRUPTIONFALLBACK,/* DSS corruption detected, fallback */ + MPTCP_MIB_DSSCORRUPTIONRESET, /* DSS corruption detected, MPJ subflow reset */ MPTCP_MIB_INFINITEMAPTX, /* Sent an infinite mapping */ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ MPTCP_MIB_DSSTCPMISMATCH, /* DSS-mapping did not map with TCP's sequence numbers */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c2317919fc14..6d0e201c3eb2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -620,6 +620,18 @@ static bool mptcp_check_data_fin(struct sock *sk) return ret; } +static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) +{ + if (READ_ONCE(msk->allow_infinite_fallback)) { + MPTCP_INC_STATS(sock_net(ssk), + MPTCP_MIB_DSSCORRUPTIONFALLBACK); + mptcp_do_fallback(ssk); + } else { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); + mptcp_subflow_reset(ssk); + } +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -692,10 +704,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, moved += len; seq += len; - if (WARN_ON_ONCE(map_remaining < len)) - break; + if (unlikely(map_remaining < len)) { + DEBUG_NET_WARN_ON_ONCE(1); + mptcp_dss_corruption(msk, ssk); + } } else { - WARN_ON_ONCE(!fin); + if (unlikely(!fin)) { + DEBUG_NET_WARN_ON_ONCE(1); + mptcp_dss_corruption(msk, ssk); + } + sk_eat_skb(ssk, skb); done = true; } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 1040b3b9696b..e1046a696ab5 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -975,8 +975,10 @@ static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb) unsigned int skb_consumed; skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq; - if (WARN_ON_ONCE(skb_consumed >= skb->len)) + if (unlikely(skb_consumed >= skb->len)) { + DEBUG_NET_WARN_ON_ONCE(1); return true; + } return skb->len - skb_consumed <= subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); -- cgit v1.2.3 From 4dabcdf581217e60690467a37c956a5b8dbc6bd9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Oct 2024 13:04:53 +0200 Subject: tcp: fix mptcp DSS corruption due to large pmtu xmit Syzkaller was able to trigger a DSS corruption: TCP: request_sock_subflow_v4: Possible SYN flooding on port [::]:20002. Sending cookies. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 5227 at net/mptcp/protocol.c:695 __mptcp_move_skbs_from_subflow+0x20a9/0x21f0 net/mptcp/protocol.c:695 Modules linked in: CPU: 0 UID: 0 PID: 5227 Comm: syz-executor350 Not tainted 6.11.0-syzkaller-08829-gaf9c191ac2a0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 RIP: 0010:__mptcp_move_skbs_from_subflow+0x20a9/0x21f0 net/mptcp/protocol.c:695 Code: 0f b6 dc 31 ff 89 de e8 b5 dd ea f5 89 d8 48 81 c4 50 01 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 cc cc cc cc e8 98 da ea f5 90 <0f> 0b 90 e9 47 ff ff ff e8 8a da ea f5 90 0f 0b 90 e9 99 e0 ff ff RSP: 0018:ffffc90000006db8 EFLAGS: 00010246 RAX: ffffffff8ba9df18 RBX: 00000000000055f0 RCX: ffff888030023c00 RDX: 0000000000000100 RSI: 00000000000081e5 RDI: 00000000000055f0 RBP: 1ffff110062bf1ae R08: ffffffff8ba9cf12 R09: 1ffff110062bf1b8 R10: dffffc0000000000 R11: ffffed10062bf1b9 R12: 0000000000000000 R13: dffffc0000000000 R14: 00000000700cec61 R15: 00000000000081e5 FS: 000055556679c380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020287000 CR3: 0000000077892000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: move_skbs_to_msk net/mptcp/protocol.c:811 [inline] mptcp_data_ready+0x29c/0xa90 net/mptcp/protocol.c:854 subflow_data_ready+0x34a/0x920 net/mptcp/subflow.c:1490 tcp_data_queue+0x20fd/0x76c0 net/ipv4/tcp_input.c:5283 tcp_rcv_established+0xfba/0x2020 net/ipv4/tcp_input.c:6237 tcp_v4_do_rcv+0x96d/0xc70 net/ipv4/tcp_ipv4.c:1915 tcp_v4_rcv+0x2dc0/0x37f0 net/ipv4/tcp_ipv4.c:2350 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5662 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5775 process_backlog+0x662/0x15b0 net/core/dev.c:6107 __napi_poll+0xcb/0x490 net/core/dev.c:6771 napi_poll net/core/dev.c:6840 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:6962 handle_softirqs+0x2c5/0x980 kernel/softirq.c:554 do_softirq+0x11b/0x1e0 kernel/softirq.c:455 __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:919 [inline] __dev_queue_xmit+0x1764/0x3e80 net/core/dev.c:4451 dev_queue_xmit include/linux/netdevice.h:3094 [inline] neigh_hh_output include/net/neighbour.h:526 [inline] neigh_output include/net/neighbour.h:540 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 ip_local_out net/ipv4/ip_output.c:130 [inline] __ip_queue_xmit+0x118c/0x1b80 net/ipv4/ip_output.c:536 __tcp_transmit_skb+0x2544/0x3b30 net/ipv4/tcp_output.c:1466 tcp_transmit_skb net/ipv4/tcp_output.c:1484 [inline] tcp_mtu_probe net/ipv4/tcp_output.c:2547 [inline] tcp_write_xmit+0x641d/0x6bf0 net/ipv4/tcp_output.c:2752 __tcp_push_pending_frames+0x9b/0x360 net/ipv4/tcp_output.c:3015 tcp_push_pending_frames include/net/tcp.h:2107 [inline] tcp_data_snd_check net/ipv4/tcp_input.c:5714 [inline] tcp_rcv_established+0x1026/0x2020 net/ipv4/tcp_input.c:6239 tcp_v4_do_rcv+0x96d/0xc70 net/ipv4/tcp_ipv4.c:1915 sk_backlog_rcv include/net/sock.h:1113 [inline] __release_sock+0x214/0x350 net/core/sock.c:3072 release_sock+0x61/0x1f0 net/core/sock.c:3626 mptcp_push_release net/mptcp/protocol.c:1486 [inline] __mptcp_push_pending+0x6b5/0x9f0 net/mptcp/protocol.c:1625 mptcp_sendmsg+0x10bb/0x1b10 net/mptcp/protocol.c:1903 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2603 ___sys_sendmsg net/socket.c:2657 [inline] __sys_sendmsg+0x2aa/0x390 net/socket.c:2686 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fb06e9317f9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe2cfd4f98 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fb06e97f468 RCX: 00007fb06e9317f9 RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000005 RBP: 00007fb06e97f446 R08: 0000555500000000 R09: 0000555500000000 R10: 0000555500000000 R11: 0000000000000246 R12: 00007fb06e97f406 R13: 0000000000000001 R14: 00007ffe2cfd4fe0 R15: 0000000000000003 Additionally syzkaller provided a nice reproducer. The repro enables pmtu on the loopback device, leading to tcp_mtu_probe() generating very large probe packets. tcp_can_coalesce_send_queue_head() currently does not check for mptcp-level invariants, and allowed the creation of cross-DSS probes, leading to the mentioned corruption. Address the issue teaching tcp_can_coalesce_send_queue_head() about mptcp using the tcp_skb_can_collapse(), also reducing the code duplication. Fixes: 85712484110d ("tcp: coalesce/collapse must respect MPTCP extensions") Cc: stable@vger.kernel.org Reported-by: syzbot+d1bff73460e33101f0e7@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/513 Signed-off-by: Paolo Abeni Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241008-net-mptcp-fallback-fixes-v1-2-c6fb8e93e551@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4fd746bd4d54..68804fd01daf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2342,10 +2342,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (len <= skb->len) break; - if (unlikely(TCP_SKB_CB(skb)->eor) || - tcp_has_tx_tstamp(skb) || - !skb_pure_zcopy_same(skb, next) || - skb_frags_readable(skb) != skb_frags_readable(next)) + if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next)) return false; len -= skb->len; -- cgit v1.2.3 From 119d51e225febc8152476340a880f5415a01e99e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 8 Oct 2024 13:04:54 +0200 Subject: mptcp: fallback when MPTCP opts are dropped after 1st data As reported by Christoph [1], before this patch, an MPTCP connection was wrongly reset when a host received a first data packet with MPTCP options after the 3wHS, but got the next ones without. According to the MPTCP v1 specs [2], a fallback should happen in this case, because the host didn't receive a DATA_ACK from the other peer, nor receive data for more than the initial window which implies a DATA_ACK being received by the other peer. The patch here re-uses the same logic as the one used in other places: by looking at allow_infinite_fallback, which is disabled at the creation of an additional subflow. It's not looking at the first DATA_ACK (or implying one received from the other side) as suggested by the RFC, but it is in continuation with what was already done, which is safer, and it fixes the reported issue. The next step, looking at this first DATA_ACK, is tracked in [4]. This patch has been validated using the following Packetdrill script: 0 socket(..., SOCK_STREAM, IPPROTO_MPTCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 // 3WHS is OK +0.0 < S 0:0(0) win 65535 +0.0 > S. 0:0(0) ack 1 +0.1 < . 1:1(0) ack 1 win 2048 +0 accept(3, ..., ...) = 4 // Data from the client with valid MPTCP options (no DATA_ACK: normal) +0.1 < P. 1:501(500) ack 1 win 2048 // From here, the MPTCP options will be dropped by a middlebox +0.0 > . 1:1(0) ack 501 +0.1 read(4, ..., 500) = 500 +0 write(4, ..., 100) = 100 // The server replies with data, still thinking MPTCP is being used +0.0 > P. 1:101(100) ack 501 // But the client already did a fallback to TCP, because the two previous packets have been received without MPTCP options +0.1 < . 501:501(0) ack 101 win 2048 +0.0 < P. 501:601(100) ack 101 win 2048 // The server should fallback to TCP, not reset: it didn't get a DATA_ACK, nor data for more than the initial window +0.0 > . 101:101(0) ack 601 Note that this script requires Packetdrill with MPTCP support, see [3]. Fixes: dea2b1ea9c70 ("mptcp: do not reset MP_CAPABLE subflow on mapping errors") Cc: stable@vger.kernel.org Reported-by: Christoph Paasch Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/518 [1] Link: https://datatracker.ietf.org/doc/html/rfc8684#name-fallback [2] Link: https://github.com/multipath-tcp/packetdrill [3] Link: https://github.com/multipath-tcp/mptcp_net-next/issues/519 [4] Reviewed-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241008-net-mptcp-fallback-fixes-v1-3-c6fb8e93e551@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e1046a696ab5..25dde81bcb75 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1282,7 +1282,7 @@ static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) else if (READ_ONCE(msk->csum_enabled)) return !subflow->valid_csum_seen; else - return !subflow->fully_established; + return READ_ONCE(msk->allow_infinite_fallback); } static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) -- cgit v1.2.3 From db0a37b7ac27d8ca27d3dc676a16d081c16ec7b9 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 8 Oct 2024 13:04:55 +0200 Subject: mptcp: pm: do not remove closing subflows In a previous fix, the in-kernel path-manager has been modified not to retrigger the removal of a subflow if it was already closed, e.g. when the initial subflow is removed, but kept in the subflows list. To be complete, this fix should also skip the subflows that are in any closing state: mptcp_close_ssk() will initiate the closure, but the switch to the TCP_CLOSE state depends on the other peer. Fixes: 58e1b66b4e4b ("mptcp: pm: do not remove already closed subflows") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241008-net-mptcp-fallback-fixes-v1-4-c6fb8e93e551@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 64fe0e7d87d7..f6f0a38a0750 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -860,7 +860,8 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow_get_local_id(subflow); - if (inet_sk_state_load(ssk) == TCP_CLOSE) + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) continue; if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) continue; -- cgit v1.2.3 From ac888d58869bb99753e7652be19a151df9ecb35d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2024 14:31:10 +0000 Subject: net: do not delay dst_entries_add() in dst_release() dst_entries_add() uses per-cpu data that might be freed at netns dismantle from ip6_route_net_exit() calling dst_entries_destroy() Before ip6_route_net_exit() can be called, we release all the dsts associated with this netns, via calls to dst_release(), which waits an rcu grace period before calling dst_destroy() dst_entries_add() use in dst_destroy() is racy, because dst_entries_destroy() could have been called already. Decrementing the number of dsts must happen sooner. Notes: 1) in CONFIG_XFRM case, dst_destroy() can call dst_release_immediate(child), this might also cause UAF if the child does not have DST_NOCOUNT set. IPSEC maintainers might take a look and see how to address this. 2) There is also discussion about removing this count of dst, which might happen in future kernels. Fixes: f88649721268 ("ipv4: fix dst race in sk_dst_get()") Closes: https://lore.kernel.org/lkml/CANn89iLCCGsP7SFn9HKpvnKu96Td4KD08xf7aGtiYgZnkjaL=w@mail.gmail.com/T/ Reported-by: Naresh Kamboju Tested-by: Linux Kernel Functional Testing Tested-by: Naresh Kamboju Signed-off-by: Eric Dumazet Cc: Xin Long Cc: Steffen Klassert Reviewed-by: Xin Long Link: https://patch.msgid.link/20241008143110.1064899-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/core/dst.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 95f533844f17..9552a90d4772 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -109,9 +109,6 @@ static void dst_destroy(struct dst_entry *dst) child = xdst->child; } #endif - if (!(dst->flags & DST_NOCOUNT)) - dst_entries_add(dst->ops, -1); - if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); @@ -159,17 +156,27 @@ void dst_dev_put(struct dst_entry *dst) } EXPORT_SYMBOL(dst_dev_put); +static void dst_count_dec(struct dst_entry *dst) +{ + if (!(dst->flags & DST_NOCOUNT)) + dst_entries_add(dst->ops, -1); +} + void dst_release(struct dst_entry *dst) { - if (dst && rcuref_put(&dst->__rcuref)) + if (dst && rcuref_put(&dst->__rcuref)) { + dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); + } } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { - if (dst && rcuref_put(&dst->__rcuref)) + if (dst && rcuref_put(&dst->__rcuref)) { + dst_count_dec(dst); dst_destroy(dst); + } } EXPORT_SYMBOL(dst_release_immediate); -- cgit v1.2.3 From 07cc7b0b942bf55ef1a471470ecda8d2a6a6541f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 11:47:32 -0700 Subject: rtnetlink: Add bulk registration helpers for rtnetlink message handlers. Before commit addf9b90de22 ("net: rtnetlink: use rcu to free rtnl message handlers"), once rtnl_msg_handlers[protocol] was allocated, the following rtnl_register_module() for the same protocol never failed. However, after the commit, rtnl_msg_handler[protocol][msgtype] needs to be allocated in each rtnl_register_module(), so each call could fail. Many callers of rtnl_register_module() do not handle the returned error, and we need to add many error handlings. To handle that easily, let's add wrapper functions for bulk registration of rtnetlink message handlers. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 17 +++++++++++++++++ net/core/rtnetlink.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index b45d57b5968a..2d3eb7cb4dff 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -29,6 +29,15 @@ static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) return msgtype & RTNL_KIND_MASK; } +struct rtnl_msg_handler { + struct module *owner; + int protocol; + int msgtype; + rtnl_doit_func doit; + rtnl_dumpit_func dumpit; + int flags; +}; + void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_register_module(struct module *owner, int protocol, int msgtype, @@ -36,6 +45,14 @@ int rtnl_register_module(struct module *owner, int protocol, int msgtype, int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n); + +#define rtnl_register_many(handlers) \ + __rtnl_register_many(handlers, ARRAY_SIZE(handlers)) +#define rtnl_unregister_many(handlers) \ + __rtnl_unregister_many(handlers, ARRAY_SIZE(handlers)) + static inline int rtnl_msg_family(const struct nlmsghdr *nlh) { if (nlmsg_len(nlh) >= sizeof(struct rtgenmsg)) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0a520987085..e30e7ea0207d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -384,6 +384,35 @@ void rtnl_unregister_all(int protocol) } EXPORT_SYMBOL_GPL(rtnl_unregister_all); +int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) +{ + const struct rtnl_msg_handler *handler; + int i, err; + + for (i = 0, handler = handlers; i < n; i++, handler++) { + err = rtnl_register_internal(handler->owner, handler->protocol, + handler->msgtype, handler->doit, + handler->dumpit, handler->flags); + if (err) { + __rtnl_unregister_many(handlers, i); + break; + } + } + + return err; +} +EXPORT_SYMBOL_GPL(__rtnl_register_many); + +void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) +{ + const struct rtnl_msg_handler *handler; + int i; + + for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) + rtnl_unregister(handler->protocol, handler->msgtype); +} +EXPORT_SYMBOL_GPL(__rtnl_unregister_many); + static LIST_HEAD(link_ops); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) -- cgit v1.2.3 From cba5e43b0b757734b1e79f624d93a71435e31136 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 11:47:34 -0700 Subject: bridge: Handle error of rtnl_register_module(). Since introduced, br_vlan_rtnl_init() has been ignoring the returned value of rtnl_register_module(), which could fail silently. Handling the error allows users to view a module as an all-or-nothing thing in terms of the rtnetlink functionality. This prevents syzkaller from reporting spurious errors from its tests, where OOM often occurs and module is automatically loaded. Let's handle the errors by rtnl_register_many(). Fixes: 8dcea187088b ("net: bridge: vlan: add rtm definitions and dump support") Fixes: f26b296585dc ("net: bridge: vlan: add new rtm message support") Fixes: adb3ce9bcb0f ("net: bridge: vlan: add del rtm message support") Signed-off-by: Kuniyuki Iwashima Acked-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- net/bridge/br_netlink.c | 6 +++++- net/bridge/br_private.h | 5 +++-- net/bridge/br_vlan.c | 19 +++++++++---------- 3 files changed, 17 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f17dbac7d828..6b97ae47f855 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1920,7 +1920,10 @@ int __init br_netlink_init(void) { int err; - br_vlan_rtnl_init(); + err = br_vlan_rtnl_init(); + if (err) + goto out; + rtnl_af_register(&br_af_ops); err = rtnl_link_register(&br_link_ops); @@ -1931,6 +1934,7 @@ int __init br_netlink_init(void) out_af: rtnl_af_unregister(&br_af_ops); +out: return err; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d4bedc87b1d8..041f6e571a20 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1571,7 +1571,7 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); -void br_vlan_rtnl_init(void); +int br_vlan_rtnl_init(void); void br_vlan_rtnl_uninit(void); void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, @@ -1802,8 +1802,9 @@ static inline int br_vlan_bridge_event(struct net_device *dev, return 0; } -static inline void br_vlan_rtnl_init(void) +static inline int br_vlan_rtnl_init(void) { + return 0; } static inline void br_vlan_rtnl_uninit(void) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 9c2fffb827ab..89f51ea4cabe 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2296,19 +2296,18 @@ static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -void br_vlan_rtnl_init(void) +static const struct rtnl_msg_handler br_vlan_rtnl_msg_handlers[] = { + {THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0}, + {THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0}, + {THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0}, +}; + +int br_vlan_rtnl_init(void) { - rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, - br_vlan_rtm_dump, 0); - rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, - br_vlan_rtm_process, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, - br_vlan_rtm_process, NULL, 0); + return rtnl_register_many(br_vlan_rtnl_msg_handlers); } void br_vlan_rtnl_uninit(void) { - rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); - rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); - rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); + rtnl_unregister_many(br_vlan_rtnl_msg_handlers); } -- cgit v1.2.3 From d51705614f668254cc5def7490df76f9680b4659 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 11:47:35 -0700 Subject: mctp: Handle error of rtnl_register_module(). Since introduced, mctp has been ignoring the returned value of rtnl_register_module(), which could fail silently. Handling the error allows users to view a module as an all-or-nothing thing in terms of the rtnetlink functionality. This prevents syzkaller from reporting spurious errors from its tests, where OOM often occurs and module is automatically loaded. Let's handle the errors by rtnl_register_many(). Fixes: 583be982d934 ("mctp: Add device handling and netlink interface") Fixes: 831119f88781 ("mctp: Add neighbour netlink interface") Fixes: 06d2f4c583a7 ("mctp: Add netlink route management") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jeremy Kerr Signed-off-by: Paolo Abeni --- include/net/mctp.h | 2 +- net/mctp/af_mctp.c | 6 +++++- net/mctp/device.c | 30 ++++++++++++++++++------------ net/mctp/neigh.c | 31 +++++++++++++++++++------------ net/mctp/route.c | 33 +++++++++++++++++++++++---------- 5 files changed, 66 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 7b17c52e8ce2..28d59ae94ca3 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -295,7 +295,7 @@ void mctp_neigh_remove_dev(struct mctp_dev *mdev); int mctp_routes_init(void); void mctp_routes_exit(void); -void mctp_device_init(void); +int mctp_device_init(void); void mctp_device_exit(void); #endif /* __NET_MCTP_H */ diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 43288b408fde..f6de136008f6 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -756,10 +756,14 @@ static __init int mctp_init(void) if (rc) goto err_unreg_routes; - mctp_device_init(); + rc = mctp_device_init(); + if (rc) + goto err_unreg_neigh; return 0; +err_unreg_neigh: + mctp_neigh_exit(); err_unreg_routes: mctp_routes_exit(); err_unreg_proto: diff --git a/net/mctp/device.c b/net/mctp/device.c index acb97b257428..85cc5f31f1e7 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -524,25 +524,31 @@ static struct notifier_block mctp_dev_nb = { .priority = ADDRCONF_NOTIFY_PRIORITY, }; -void __init mctp_device_init(void) +static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = { + {THIS_MODULE, PF_MCTP, RTM_NEWADDR, mctp_rtm_newaddr, NULL, 0}, + {THIS_MODULE, PF_MCTP, RTM_DELADDR, mctp_rtm_deladdr, NULL, 0}, + {THIS_MODULE, PF_MCTP, RTM_GETADDR, NULL, mctp_dump_addrinfo, 0}, +}; + +int __init mctp_device_init(void) { - register_netdevice_notifier(&mctp_dev_nb); + int err; - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETADDR, - NULL, mctp_dump_addrinfo, 0); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWADDR, - mctp_rtm_newaddr, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELADDR, - mctp_rtm_deladdr, NULL, 0); + register_netdevice_notifier(&mctp_dev_nb); rtnl_af_register(&mctp_af_ops); + + err = rtnl_register_many(mctp_device_rtnl_msg_handlers); + if (err) { + rtnl_af_unregister(&mctp_af_ops); + unregister_netdevice_notifier(&mctp_dev_nb); + } + + return err; } void __exit mctp_device_exit(void) { + rtnl_unregister_many(mctp_device_rtnl_msg_handlers); rtnl_af_unregister(&mctp_af_ops); - rtnl_unregister(PF_MCTP, RTM_DELADDR); - rtnl_unregister(PF_MCTP, RTM_NEWADDR); - rtnl_unregister(PF_MCTP, RTM_GETADDR); - unregister_netdevice_notifier(&mctp_dev_nb); } diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index ffa0f9e0983f..590f642413e4 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -322,22 +322,29 @@ static struct pernet_operations mctp_net_ops = { .exit = mctp_neigh_net_exit, }; +static const struct rtnl_msg_handler mctp_neigh_rtnl_msg_handlers[] = { + {THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, mctp_rtm_newneigh, NULL, 0}, + {THIS_MODULE, PF_MCTP, RTM_DELNEIGH, mctp_rtm_delneigh, NULL, 0}, + {THIS_MODULE, PF_MCTP, RTM_GETNEIGH, NULL, mctp_rtm_getneigh, 0}, +}; + int __init mctp_neigh_init(void) { - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, - mctp_rtm_newneigh, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELNEIGH, - mctp_rtm_delneigh, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETNEIGH, - NULL, mctp_rtm_getneigh, 0); - - return register_pernet_subsys(&mctp_net_ops); + int err; + + err = register_pernet_subsys(&mctp_net_ops); + if (err) + return err; + + err = rtnl_register_many(mctp_neigh_rtnl_msg_handlers); + if (err) + unregister_pernet_subsys(&mctp_net_ops); + + return err; } -void __exit mctp_neigh_exit(void) +void mctp_neigh_exit(void) { + rtnl_unregister_many(mctp_neigh_rtnl_msg_handlers); unregister_pernet_subsys(&mctp_net_ops); - rtnl_unregister(PF_MCTP, RTM_GETNEIGH); - rtnl_unregister(PF_MCTP, RTM_DELNEIGH); - rtnl_unregister(PF_MCTP, RTM_NEWNEIGH); } diff --git a/net/mctp/route.c b/net/mctp/route.c index eefd7834d9a0..597e9cf5aa64 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -1474,26 +1474,39 @@ static struct pernet_operations mctp_net_ops = { .exit = mctp_routes_net_exit, }; +static const struct rtnl_msg_handler mctp_route_rtnl_msg_handlers[] = { + {THIS_MODULE, PF_MCTP, RTM_NEWROUTE, mctp_newroute, NULL, 0}, + {THIS_MODULE, PF_MCTP, RTM_DELROUTE, mctp_delroute, NULL, 0}, + {THIS_MODULE, PF_MCTP, RTM_GETROUTE, NULL, mctp_dump_rtinfo, 0}, +}; + int __init mctp_routes_init(void) { + int err; + dev_add_pack(&mctp_packet_type); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE, - NULL, mctp_dump_rtinfo, 0); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE, - mctp_newroute, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE, - mctp_delroute, NULL, 0); + err = register_pernet_subsys(&mctp_net_ops); + if (err) + goto err_pernet; + + err = rtnl_register_many(mctp_route_rtnl_msg_handlers); + if (err) + goto err_rtnl; - return register_pernet_subsys(&mctp_net_ops); + return 0; + +err_rtnl: + unregister_pernet_subsys(&mctp_net_ops); +err_pernet: + dev_remove_pack(&mctp_packet_type); + return err; } void mctp_routes_exit(void) { + rtnl_unregister_many(mctp_route_rtnl_msg_handlers); unregister_pernet_subsys(&mctp_net_ops); - rtnl_unregister(PF_MCTP, RTM_DELROUTE); - rtnl_unregister(PF_MCTP, RTM_NEWROUTE); - rtnl_unregister(PF_MCTP, RTM_GETROUTE); dev_remove_pack(&mctp_packet_type); } -- cgit v1.2.3 From 5be2062e3080e3ff6707816caa445ec0c6eaacf7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 11:47:36 -0700 Subject: mpls: Handle error of rtnl_register_module(). Since introduced, mpls_init() has been ignoring the returned value of rtnl_register_module(), which could fail silently. Handling the error allows users to view a module as an all-or-nothing thing in terms of the rtnetlink functionality. This prevents syzkaller from reporting spurious errors from its tests, where OOM often occurs and module is automatically loaded. Let's handle the errors by rtnl_register_many(). Fixes: 03c0566542f4 ("mpls: Netlink commands to add, remove, and dump routes") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/mpls/af_mpls.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index aba983531ed3..df62638b6498 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2728,6 +2728,15 @@ static struct rtnl_af_ops mpls_af_ops __read_mostly = { .get_stats_af_size = mpls_get_stats_af_size, }; +static const struct rtnl_msg_handler mpls_rtnl_msg_handlers[] __initdata_or_module = { + {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0}, + {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0}, + {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, 0}, + {THIS_MODULE, PF_MPLS, RTM_GETNETCONF, + mpls_netconf_get_devconf, mpls_netconf_dump_devconf, + RTNL_FLAG_DUMP_UNLOCKED}, +}; + static int __init mpls_init(void) { int err; @@ -2746,24 +2755,25 @@ static int __init mpls_init(void) rtnl_af_register(&mpls_af_ops); - rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_NEWROUTE, - mpls_rtm_newroute, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_DELROUTE, - mpls_rtm_delroute, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETROUTE, - mpls_getroute, mpls_dump_routes, 0); - rtnl_register_module(THIS_MODULE, PF_MPLS, RTM_GETNETCONF, - mpls_netconf_get_devconf, - mpls_netconf_dump_devconf, - RTNL_FLAG_DUMP_UNLOCKED); - err = ipgre_tunnel_encap_add_mpls_ops(); + err = rtnl_register_many(mpls_rtnl_msg_handlers); if (err) + goto out_unregister_rtnl_af; + + err = ipgre_tunnel_encap_add_mpls_ops(); + if (err) { pr_err("Can't add mpls over gre tunnel ops\n"); + goto out_unregister_rtnl; + } err = 0; out: return err; +out_unregister_rtnl: + rtnl_unregister_many(mpls_rtnl_msg_handlers); +out_unregister_rtnl_af: + rtnl_af_unregister(&mpls_af_ops); + dev_remove_pack(&mpls_packet_type); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); goto out; -- cgit v1.2.3 From b5e837c86041bef60f36cf9f20a641a30764379a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 11:47:37 -0700 Subject: phonet: Handle error of rtnl_register_module(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit addf9b90de22 ("net: rtnetlink: use rcu to free rtnl message handlers"), once the first rtnl_register_module() allocated rtnl_msg_handlers[PF_PHONET], the following calls never failed. However, after the commit, rtnl_register_module() could fail silently to allocate rtnl_msg_handlers[PF_PHONET][msgtype] and requires error handling for each call. Handling the error allows users to view a module as an all-or-nothing thing in terms of the rtnetlink functionality. This prevents syzkaller from reporting spurious errors from its tests, where OOM often occurs and module is automatically loaded. Let's use rtnl_register_many() to handle the errors easily. Fixes: addf9b90de22 ("net: rtnetlink: use rcu to free rtnl message handlers") Signed-off-by: Kuniyuki Iwashima Acked-by: Rémi Denis-Courmont Signed-off-by: Paolo Abeni --- net/phonet/pn_netlink.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 7008d402499d..894e5c72d6bf 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -285,23 +285,17 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_module = { + {THIS_MODULE, PF_PHONET, RTM_NEWADDR, addr_doit, NULL, 0}, + {THIS_MODULE, PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0}, + {THIS_MODULE, PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0}, + {THIS_MODULE, PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0}, + {THIS_MODULE, PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0}, + {THIS_MODULE, PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, + RTNL_FLAG_DUMP_UNLOCKED}, +}; + int __init phonet_netlink_register(void) { - int err = rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWADDR, - addr_doit, NULL, 0); - if (err) - return err; - - /* Further rtnl_register_module() cannot fail */ - rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELADDR, - addr_doit, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETADDR, - NULL, getaddr_dumpit, 0); - rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_NEWROUTE, - route_doit, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_DELROUTE, - route_doit, NULL, 0); - rtnl_register_module(THIS_MODULE, PF_PHONET, RTM_GETROUTE, - NULL, route_dumpit, RTNL_FLAG_DUMP_UNLOCKED); - return 0; + return rtnl_register_many(phonet_rtnl_msg_handlers); } -- cgit v1.2.3 From 6fd27ea183c208e478129a85e11d880fc70040f2 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 9 Oct 2024 14:55:16 +0800 Subject: net/smc: fix lacks of icsk_syn_mss with IPPROTO_SMC Eric report a panic on IPPROTO_SMC, and give the facts that when INET_PROTOSW_ICSK was set, icsk->icsk_sync_mss must be set too. Bug: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Mem abort info: ESR = 0x0000000086000005 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault user pgtable: 4k pages, 48-bit VAs, pgdp=00000001195d1000 [0000000000000000] pgd=0800000109c46003, p4d=0800000109c46003, pud=0000000000000000 Internal error: Oops: 0000000086000005 [#1] PREEMPT SMP Modules linked in: CPU: 1 UID: 0 PID: 8037 Comm: syz.3.265 Not tainted 6.11.0-rc7-syzkaller-g5f5673607153 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : 0x0 lr : cipso_v4_sock_setattr+0x2a8/0x3c0 net/ipv4/cipso_ipv4.c:1910 sp : ffff80009b887a90 x29: ffff80009b887aa0 x28: ffff80008db94050 x27: 0000000000000000 x26: 1fffe0001aa6f5b3 x25: dfff800000000000 x24: ffff0000db75da00 x23: 0000000000000000 x22: ffff0000d8b78518 x21: 0000000000000000 x20: ffff0000d537ad80 x19: ffff0000d8b78000 x18: 1fffe000366d79ee x17: ffff8000800614a8 x16: ffff800080569b84 x15: 0000000000000001 x14: 000000008b336894 x13: 00000000cd96feaa x12: 0000000000000003 x11: 0000000000040000 x10: 00000000000020a3 x9 : 1fffe0001b16f0f1 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000001 x3 : 0000000000000000 x2 : 0000000000000002 x1 : 0000000000000000 x0 : ffff0000d8b78000 Call trace: 0x0 netlbl_sock_setattr+0x2e4/0x338 net/netlabel/netlabel_kapi.c:1000 smack_netlbl_add+0xa4/0x154 security/smack/smack_lsm.c:2593 smack_socket_post_create+0xa8/0x14c security/smack/smack_lsm.c:2973 security_socket_post_create+0x94/0xd4 security/security.c:4425 __sock_create+0x4c8/0x884 net/socket.c:1587 sock_create net/socket.c:1622 [inline] __sys_socket_create net/socket.c:1659 [inline] __sys_socket+0x134/0x340 net/socket.c:1706 __do_sys_socket net/socket.c:1720 [inline] __se_sys_socket net/socket.c:1718 [inline] __arm64_sys_socket+0x7c/0x94 net/socket.c:1718 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Code: ???????? ???????? ???????? ???????? (????????) ---[ end trace 0000000000000000 ]--- This patch add a toy implementation that performs a simple return to prevent such panic. This is because MSS can be set in sock_create_kern or smc_setsockopt, similar to how it's done in AF_SMC. However, for AF_SMC, there is currently no way to synchronize MSS within __sys_connect_file. This toy implementation lays the groundwork for us to support such feature for IPPROTO_SMC in the future. Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Reported-by: Eric Dumazet Signed-off-by: D. Wythe Reviewed-by: Eric Dumazet Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/1728456916-67035-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_inet.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index a5b2041600f9..a944e7dcb8b9 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -108,12 +108,23 @@ static struct inet_protosw smc_inet6_protosw = { }; #endif /* CONFIG_IPV6 */ +static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) +{ + /* No need pass it through to clcsock, mss can always be set by + * sock_create_kern or smc_setsockopt. + */ + return 0; +} + static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); + + inet_csk(sk)->icsk_sync_mss = smc_sync_mss; + /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } -- cgit v1.2.3 From 22600596b6756b166fd052d5facb66287e6f0bad Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Oct 2024 14:47:13 -0400 Subject: ipv4: give an IPv4 dev to blackhole_netdev After commit 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries"), blackhole_netdev was introduced to invalidate dst cache entries on the TX path whenever the cache times out or is flushed. When two UDP sockets (sk1 and sk2) send messages to the same destination simultaneously, they are using the same dst cache. If the dst cache is invalidated on one path (sk2) while the other (sk1) is still transmitting, sk1 may try to use the invalid dst entry. CPU1 CPU2 udp_sendmsg(sk1) udp_sendmsg(sk2) udp_send_skb() ip_output() <--- dst timeout or flushed dst_dev_put() ip_finish_output2() ip_neigh_for_gw() This results in a scenario where ip_neigh_for_gw() returns -EINVAL because blackhole_dev lacks an in_dev, which is needed to initialize the neigh in arp_constructor(). This error is then propagated back to userspace, breaking the UDP application. The patch fixes this issue by assigning an in_dev to blackhole_dev for IPv4, similar to what was done for IPv6 in commit e5f80fcf869a ("ipv6: give an IPv6 dev to blackhole_netdev"). This ensures that even when the dst entry is invalidated with blackhole_dev, it will not fail to create the neigh entry. As devinet_init() is called ealier than blackhole_netdev_init() in system booting, it can not assign the in_dev to blackhole_dev in devinet_init(). As Paolo suggested, add a separate late_initcall() in devinet.c to ensure inet_blackhole_dev_init() is called after blackhole_netdev_init(). Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries") Signed-off-by: Xin Long Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/3000792d45ca44e16c785ebe2b092e610e5b3df1.1728499633.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ab76744383cf..7cf5f7d0d0de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -298,17 +298,19 @@ static struct in_device *inetdev_init(struct net_device *dev) /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); - err = devinet_sysctl_register(in_dev); - if (err) { - in_dev->dead = 1; - neigh_parms_release(&arp_tbl, in_dev->arp_parms); - in_dev_put(in_dev); - in_dev = NULL; - goto out; + if (dev != blackhole_netdev) { + err = devinet_sysctl_register(in_dev); + if (err) { + in_dev->dead = 1; + neigh_parms_release(&arp_tbl, in_dev->arp_parms); + in_dev_put(in_dev); + in_dev = NULL; + goto out; + } + ip_mc_init_dev(in_dev); + if (dev->flags & IFF_UP) + ip_mc_up(in_dev); } - ip_mc_init_dev(in_dev); - if (dev->flags & IFF_UP) - ip_mc_up(in_dev); /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); @@ -347,6 +349,19 @@ static void inetdev_destroy(struct in_device *in_dev) in_dev_put(in_dev); } +static int __init inet_blackhole_dev_init(void) +{ + int err = 0; + + rtnl_lock(); + if (!inetdev_init(blackhole_netdev)) + err = -ENOMEM; + rtnl_unlock(); + + return err; +} +late_initcall(inet_blackhole_dev_init); + int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; -- cgit v1.2.3 From 25c12b459db8365fee84b63f3dd7910f70627f29 Mon Sep 17 00:00:00 2001 From: Kai Shen Date: Thu, 10 Oct 2024 11:56:24 +0000 Subject: net/smc: Fix memory leak when using percpu refs This patch adds missing percpu_ref_exit when releasing percpu refs. When releasing percpu refs, percpu_ref_exit should be called. Otherwise, memory leak happens. Fixes: 79a22238b4f2 ("net/smc: Use percpu ref for wr tx reference") Signed-off-by: Kai Shen Reviewed-by: Dust Li Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/20241010115624.7769-1-KaiShen@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_wr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 0021065a600a..994c0cd4fddb 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -648,8 +648,10 @@ void smc_wr_free_link(struct smc_link *lnk) smc_wr_tx_wait_no_pending_sends(lnk); percpu_ref_kill(&lnk->wr_reg_refs); wait_for_completion(&lnk->reg_ref_comp); + percpu_ref_exit(&lnk->wr_reg_refs); percpu_ref_kill(&lnk->wr_tx_refs); wait_for_completion(&lnk->tx_ref_comp); + percpu_ref_exit(&lnk->wr_tx_refs); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, @@ -912,11 +914,13 @@ int smc_wr_create_link(struct smc_link *lnk) init_waitqueue_head(&lnk->wr_reg_wait); rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); if (rc) - goto dma_unmap; + goto cancel_ref; init_completion(&lnk->reg_ref_comp); init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; +cancel_ref: + percpu_ref_exit(&lnk->wr_tx_refs); dma_unmap: if (lnk->wr_rx_v2_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, -- cgit v1.2.3 From 82ac39ebd6db0c9f7a97a934bda1e3e101a9d201 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 14 Oct 2024 19:53:21 +0800 Subject: net/smc: Fix searching in list of known pnetids in smc_pnet_add_pnetid pnetid of pi (not newly allocated pe) should be compared Fixes: e888a2e8337c ("net/smc: introduce list of pnetids for Ethernet devices") Reviewed-by: D. Wythe Reviewed-by: Wen Gu Signed-off-by: Li RongQing Reviewed-by: Simon Horman Reviewed-by: Gerd Bayer Link: https://patch.msgid.link/20241014115321.33234-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 1dd362326c0a..a04aa0e882f8 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -753,7 +753,7 @@ static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { - if (smc_pnet_match(pnetid, pe->pnetid)) { + if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; -- cgit v1.2.3 From 3d041393ea8c815f773020fb4a995331a69c0139 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Oct 2024 16:06:00 +0200 Subject: mptcp: prevent MPC handshake on port-based signal endpoints Syzkaller reported a lockdep splat: ============================================ WARNING: possible recursive locking detected 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Not tainted -------------------------------------------- syz-executor364/5113 is trying to acquire lock: ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff8880449f1958 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 but task is already holding lock: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(k-slock-AF_INET); lock(k-slock-AF_INET); *** DEADLOCK *** May be due to missing lock nesting notation 7 locks held by syz-executor364/5113: #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline] #0: ffff8880449f0e18 (sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg+0x153/0x1b10 net/mptcp/protocol.c:1806 #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1607 [inline] #1: ffff88803fe39ad8 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: mptcp_sendmsg_fastopen+0x11f/0x530 net/mptcp/protocol.c:1727 #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #2: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: __ip_queue_xmit+0x5f/0x1b80 net/ipv4/ip_output.c:470 #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #3: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x45f/0x1390 net/ipv4/ip_output.c:228 #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: local_lock_acquire include/linux/local_lock_internal.h:29 [inline] #4: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: process_backlog+0x33b/0x15b0 net/core/dev.c:6104 #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:326 [inline] #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #5: ffffffff8e938320 (rcu_read_lock){....}-{1:2}, at: ip_local_deliver_finish+0x230/0x5f0 net/ipv4/ip_input.c:232 #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: spin_lock include/linux/spinlock.h:351 [inline] #6: ffff88803fe3cb58 (k-slock-AF_INET){+.-.}-{2:2}, at: sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 stack backtrace: CPU: 0 UID: 0 PID: 5113 Comm: syz-executor364 Not tainted 6.11.0-rc6-syzkaller-00019-g67784a74e258 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 check_deadlock kernel/locking/lockdep.c:3061 [inline] validate_chain+0x15d3/0x5900 kernel/locking/lockdep.c:3855 __lock_acquire+0x137a/0x2040 kernel/locking/lockdep.c:5142 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5759 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:351 [inline] sk_clone_lock+0x2cd/0xf40 net/core/sock.c:2328 mptcp_sk_clone_init+0x32/0x13c0 net/mptcp/protocol.c:3279 subflow_syn_recv_sock+0x931/0x1920 net/mptcp/subflow.c:874 tcp_check_req+0xfe4/0x1a20 net/ipv4/tcp_minisocks.c:853 tcp_v4_rcv+0x1c3e/0x37f0 net/ipv4/tcp_ipv4.c:2267 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5661 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5775 process_backlog+0x662/0x15b0 net/core/dev.c:6108 __napi_poll+0xcb/0x490 net/core/dev.c:6772 napi_poll net/core/dev.c:6841 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:6963 handle_softirqs+0x2c4/0x970 kernel/softirq.c:554 do_softirq+0x11b/0x1e0 kernel/softirq.c:455 __local_bh_enable_ip+0x1bb/0x200 kernel/softirq.c:382 local_bh_enable include/linux/bottom_half.h:33 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:908 [inline] __dev_queue_xmit+0x1763/0x3e90 net/core/dev.c:4450 dev_queue_xmit include/linux/netdevice.h:3105 [inline] neigh_hh_output include/net/neighbour.h:526 [inline] neigh_output include/net/neighbour.h:540 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:235 ip_local_out net/ipv4/ip_output.c:129 [inline] __ip_queue_xmit+0x118c/0x1b80 net/ipv4/ip_output.c:535 __tcp_transmit_skb+0x2544/0x3b30 net/ipv4/tcp_output.c:1466 tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:6542 [inline] tcp_rcv_state_process+0x2c32/0x4570 net/ipv4/tcp_input.c:6729 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1934 sk_backlog_rcv include/net/sock.h:1111 [inline] __release_sock+0x214/0x350 net/core/sock.c:3004 release_sock+0x61/0x1f0 net/core/sock.c:3558 mptcp_sendmsg_fastopen+0x1ad/0x530 net/mptcp/protocol.c:1733 mptcp_sendmsg+0x1884/0x1b10 net/mptcp/protocol.c:1812 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x1a6/0x270 net/socket.c:745 ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597 ___sys_sendmsg net/socket.c:2651 [inline] __sys_sendmmsg+0x3b2/0x740 net/socket.c:2737 __do_sys_sendmmsg net/socket.c:2766 [inline] __se_sys_sendmmsg net/socket.c:2763 [inline] __x64_sys_sendmmsg+0xa0/0xb0 net/socket.c:2763 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f04fb13a6b9 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 01 1a 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd651f42d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f04fb13a6b9 RDX: 0000000000000001 RSI: 0000000020000d00 RDI: 0000000000000004 RBP: 00007ffd651f4310 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000020000080 R11: 0000000000000246 R12: 00000000000f4240 R13: 00007f04fb187449 R14: 00007ffd651f42f4 R15: 00007ffd651f4300 As noted by Cong Wang, the splat is false positive, but the code path leading to the report is an unexpected one: a client is attempting an MPC handshake towards the in-kernel listener created by the in-kernel PM for a port based signal endpoint. Such connection will be never accepted; many of them can make the listener queue full and preventing the creation of MPJ subflow via such listener - its intended role. Explicitly detect this scenario at initial-syn time and drop the incoming MPC request. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Cc: stable@vger.kernel.org Reported-by: syzbot+f4aacdfef2c6a6529c3e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f4aacdfef2c6a6529c3e Cc: Cong Wang Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241014-net-mptcp-mpc-port-endp-v2-1-7faea8e6b6ae@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/pm_netlink.c | 1 + net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 11 +++++++++++ 5 files changed, 15 insertions(+) (limited to 'net') diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index ad88bd3c58df..19eb9292bd60 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -17,6 +17,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP), SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED), + SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 3206cdda8bb1..128282982843 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -12,6 +12,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEDROP, /* Client-side fallback due to a MPC drop */ MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */ + MPTCP_MIB_MPCAPABLEENDPATTEMPT, /* Prohibited MPC to port-based endp */ MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f6f0a38a0750..1a78998fe1f4 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1121,6 +1121,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, */ inet_sk_state_store(newsk, TCP_LISTEN); lock_sock(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); err = __inet_listen_sk(ssk, backlog); if (!err) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 74417aae08d0..568a72702b08 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -535,6 +535,7 @@ struct mptcp_subflow_context { __unused : 8; bool data_avail; bool scheduled; + bool pm_listener; /* a listener managed by the kernel PM? */ u32 remote_nonce; u64 thmac; u32 local_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 25dde81bcb75..6170f2fff71e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -132,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) } } +static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb) +{ + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + return -EPERM; +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -165,6 +172,8 @@ static int subflow_check_req(struct request_sock *req, if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + if (unlikely(listener->pm_listener)) + return subflow_reset_req_endp(req, skb); if (opt_mp_join) return 0; } else if (opt_mp_join) { @@ -172,6 +181,8 @@ static int subflow_check_req(struct request_sock *req, if (mp_opt.backup) SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX); + } else if (unlikely(listener->pm_listener)) { + return subflow_reset_req_endp(req, skb); } if (opt_mp_capable && listener->request_mptcp) { -- cgit v1.2.3 From e8c526f2bdf1845bedaf6a478816a3d06fa78b8f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 15:33:12 -0700 Subject: tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink(). Martin KaFai Lau reported use-after-free [0] in reqsk_timer_handler(). """ We are seeing a use-after-free from a bpf prog attached to trace_tcp_retransmit_synack. The program passes the req->sk to the bpf_sk_storage_get_tracing kernel helper which does check for null before using it. """ The commit 83fccfc3940c ("inet: fix potential deadlock in reqsk_queue_unlink()") added timer_pending() in reqsk_queue_unlink() not to call del_timer_sync() from reqsk_timer_handler(), but it introduced a small race window. Before the timer is called, expire_timers() calls detach_timer(timer, true) to clear timer->entry.pprev and marks it as not pending. If reqsk_queue_unlink() checks timer_pending() just after expire_timers() calls detach_timer(), TCP will miss del_timer_sync(); the reqsk timer will continue running and send multiple SYN+ACKs until it expires. The reported UAF could happen if req->sk is close()d earlier than the timer expiration, which is 63s by default. The scenario would be 1. inet_csk_complete_hashdance() calls inet_csk_reqsk_queue_drop(), but del_timer_sync() is missed 2. reqsk timer is executed and scheduled again 3. req->sk is accept()ed and reqsk_put() decrements rsk_refcnt, but reqsk timer still has another one, and inet_csk_accept() does not clear req->sk for non-TFO sockets 4. sk is close()d 5. reqsk timer is executed again, and BPF touches req->sk Let's not use timer_pending() by passing the caller context to __inet_csk_reqsk_queue_drop(). Note that reqsk timer is pinned, so the issue does not happen in most use cases. [1] [0] BUG: KFENCE: use-after-free read in bpf_sk_storage_get_tracing+0x2e/0x1b0 Use-after-free read at 0x00000000a891fb3a (in kfence-#1): bpf_sk_storage_get_tracing+0x2e/0x1b0 bpf_prog_5ea3e95db6da0438_tcp_retransmit_synack+0x1d20/0x1dda bpf_trace_run2+0x4c/0xc0 tcp_rtx_synack+0xf9/0x100 reqsk_timer_handler+0xda/0x3d0 run_timer_softirq+0x292/0x8a0 irq_exit_rcu+0xf5/0x320 sysvec_apic_timer_interrupt+0x6d/0x80 asm_sysvec_apic_timer_interrupt+0x16/0x20 intel_idle_irq+0x5a/0xa0 cpuidle_enter_state+0x94/0x273 cpu_startup_entry+0x15e/0x260 start_secondary+0x8a/0x90 secondary_startup_64_no_verify+0xfa/0xfb kfence-#1: 0x00000000a72cc7b6-0x00000000d97616d9, size=2376, cache=TCPv6 allocated by task 0 on cpu 9 at 260507.901592s: sk_prot_alloc+0x35/0x140 sk_clone_lock+0x1f/0x3f0 inet_csk_clone_lock+0x15/0x160 tcp_create_openreq_child+0x1f/0x410 tcp_v6_syn_recv_sock+0x1da/0x700 tcp_check_req+0x1fb/0x510 tcp_v6_rcv+0x98b/0x1420 ipv6_list_rcv+0x2258/0x26e0 napi_complete_done+0x5b1/0x2990 mlx5e_napi_poll+0x2ae/0x8d0 net_rx_action+0x13e/0x590 irq_exit_rcu+0xf5/0x320 common_interrupt+0x80/0x90 asm_common_interrupt+0x22/0x40 cpuidle_enter_state+0xfb/0x273 cpu_startup_entry+0x15e/0x260 start_secondary+0x8a/0x90 secondary_startup_64_no_verify+0xfa/0xfb freed by task 0 on cpu 9 at 260507.927527s: rcu_core_si+0x4ff/0xf10 irq_exit_rcu+0xf5/0x320 sysvec_apic_timer_interrupt+0x6d/0x80 asm_sysvec_apic_timer_interrupt+0x16/0x20 cpuidle_enter_state+0xfb/0x273 cpu_startup_entry+0x15e/0x260 start_secondary+0x8a/0x90 secondary_startup_64_no_verify+0xfa/0xfb Fixes: 83fccfc3940c ("inet: fix potential deadlock in reqsk_queue_unlink()") Reported-by: Martin KaFai Lau Closes: https://lore.kernel.org/netdev/eb6684d0-ffd9-4bdc-9196-33f690c25824@linux.dev/ Link: https://lore.kernel.org/netdev/b55e2ca0-42f2-4b7c-b445-6ffd87ca74a0@linux.dev/ [1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Martin KaFai Lau Link: https://patch.msgid.link/20241014223312.4254-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2c5632d4fddb..2b698f8419fe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1045,21 +1045,31 @@ static bool reqsk_queue_unlink(struct request_sock *req) found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } - if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) - reqsk_put(req); + return found; } -bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +static bool __inet_csk_reqsk_queue_drop(struct sock *sk, + struct request_sock *req, + bool from_timer) { bool unlinked = reqsk_queue_unlink(req); + if (!from_timer && timer_delete_sync(&req->rsk_timer)) + reqsk_put(req); + if (unlinked) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); reqsk_put(req); } + return unlinked; } + +bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) +{ + return __inet_csk_reqsk_queue_drop(sk, req, false); +} EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) @@ -1152,7 +1162,7 @@ static void reqsk_timer_handler(struct timer_list *t) if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) { /* delete timer */ - inet_csk_reqsk_queue_drop(sk_listener, nreq); + __inet_csk_reqsk_queue_drop(sk_listener, nreq, true); goto no_ownership; } @@ -1178,7 +1188,8 @@ no_ownership: } drop: - inet_csk_reqsk_queue_drop_and_put(oreq->rsk_listener, oreq); + __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); + reqsk_put(req); } static bool reqsk_queue_hash_req(struct request_sock *req, -- cgit v1.2.3 From 56440d7ec28d60f8da3bfa09062b3368ff9b16db Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Oct 2024 17:12:17 +0000 Subject: genetlink: hold RCU in genlmsg_mcast() While running net selftests with CONFIG_PROVE_RCU_LIST=y I saw one lockdep splat [1]. genlmsg_mcast() uses for_each_net_rcu(), and must therefore hold RCU. Instead of letting all callers guard genlmsg_multicast_allns() with a rcu_read_lock()/rcu_read_unlock() pair, do it in genlmsg_mcast(). This also means the @flags parameter is useless, we need to always use GFP_ATOMIC. [1] [10882.424136] ============================= [10882.424166] WARNING: suspicious RCU usage [10882.424309] 6.12.0-rc2-virtme #1156 Not tainted [10882.424400] ----------------------------- [10882.424423] net/netlink/genetlink.c:1940 RCU-list traversed in non-reader section!! [10882.424469] other info that might help us debug this: [10882.424500] rcu_scheduler_active = 2, debug_locks = 1 [10882.424744] 2 locks held by ip/15677: [10882.424791] #0: ffffffffb6b491b0 (cb_lock){++++}-{3:3}, at: genl_rcv (net/netlink/genetlink.c:1219) [10882.426334] #1: ffffffffb6b49248 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg (net/netlink/genetlink.c:61 net/netlink/genetlink.c:57 net/netlink/genetlink.c:1209) [10882.426465] stack backtrace: [10882.426805] CPU: 14 UID: 0 PID: 15677 Comm: ip Not tainted 6.12.0-rc2-virtme #1156 [10882.426919] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [10882.427046] Call Trace: [10882.427131] [10882.427244] dump_stack_lvl (lib/dump_stack.c:123) [10882.427335] lockdep_rcu_suspicious (kernel/locking/lockdep.c:6822) [10882.427387] genlmsg_multicast_allns (net/netlink/genetlink.c:1940 (discriminator 7) net/netlink/genetlink.c:1977 (discriminator 7)) [10882.427436] l2tp_tunnel_notify.constprop.0 (net/l2tp/l2tp_netlink.c:119) l2tp_netlink [10882.427683] l2tp_nl_cmd_tunnel_create (net/l2tp/l2tp_netlink.c:253) l2tp_netlink [10882.427748] genl_family_rcv_msg_doit (net/netlink/genetlink.c:1115) [10882.427834] genl_rcv_msg (net/netlink/genetlink.c:1195 net/netlink/genetlink.c:1210) [10882.427877] ? __pfx_l2tp_nl_cmd_tunnel_create (net/l2tp/l2tp_netlink.c:186) l2tp_netlink [10882.427927] ? __pfx_genl_rcv_msg (net/netlink/genetlink.c:1201) [10882.427959] netlink_rcv_skb (net/netlink/af_netlink.c:2551) [10882.428069] genl_rcv (net/netlink/genetlink.c:1220) [10882.428095] netlink_unicast (net/netlink/af_netlink.c:1332 net/netlink/af_netlink.c:1357) [10882.428140] netlink_sendmsg (net/netlink/af_netlink.c:1901) [10882.428210] ____sys_sendmsg (net/socket.c:729 (discriminator 1) net/socket.c:744 (discriminator 1) net/socket.c:2607 (discriminator 1)) Fixes: 33f72e6f0c67 ("l2tp : multicast notification to the registered listeners") Signed-off-by: Eric Dumazet Cc: James Chapman Cc: Tom Parkin Cc: Johannes Berg Link: https://patch.msgid.link/20241011171217.3166614-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/target/target_core_user.c | 2 +- include/net/genetlink.h | 3 +-- net/l2tp/l2tp_netlink.c | 4 ++-- net/netlink/genetlink.c | 28 ++++++++++++++-------------- net/wireless/nl80211.c | 8 ++------ 5 files changed, 20 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 7eb94894bd68..717931267bda 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -2130,7 +2130,7 @@ static int tcmu_netlink_event_send(struct tcmu_dev *udev, } ret = genlmsg_multicast_allns(&tcmu_genl_family, skb, 0, - TCMU_MCGRP_CONFIG, GFP_KERNEL); + TCMU_MCGRP_CONFIG); /* Wait during an add as the listener may not be up yet */ if (ret == 0 || diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ab49bfeae78..c1d91f1d20f6 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -531,13 +531,12 @@ static inline int genlmsg_multicast(const struct genl_family *family, * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array - * @flags: allocation flags * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, - unsigned int group, gfp_t flags); + unsigned int group); /** * genlmsg_unicast - unicast a netlink message diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 284f1dec1b56..59457c0c14aa 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -116,7 +116,7 @@ static int l2tp_tunnel_notify(struct genl_family *family, NLM_F_ACK, tunnel, cmd); if (ret >= 0) { - ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; @@ -144,7 +144,7 @@ static int l2tp_session_notify(struct genl_family *family, NLM_F_ACK, session, cmd); if (ret >= 0) { - ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC); + ret = genlmsg_multicast_allns(family, msg, 0, 0); /* We don't care if no one is listening */ if (ret == -ESRCH) ret = 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index feb54c63a116..07ad65774fe2 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1501,15 +1501,11 @@ static int genl_ctrl_event(int event, const struct genl_family *family, if (IS_ERR(msg)) return PTR_ERR(msg); - if (!family->netnsok) { + if (!family->netnsok) genlmsg_multicast_netns(&genl_ctrl, &init_net, msg, 0, 0, GFP_KERNEL); - } else { - rcu_read_lock(); - genlmsg_multicast_allns(&genl_ctrl, msg, 0, - 0, GFP_ATOMIC); - rcu_read_unlock(); - } + else + genlmsg_multicast_allns(&genl_ctrl, msg, 0, 0); return 0; } @@ -1929,23 +1925,23 @@ problem: core_initcall(genl_init); -static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, - gfp_t flags) +static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group) { struct sk_buff *tmp; struct net *net, *prev = NULL; bool delivered = false; int err; + rcu_read_lock(); for_each_net_rcu(net) { if (prev) { - tmp = skb_clone(skb, flags); + tmp = skb_clone(skb, GFP_ATOMIC); if (!tmp) { err = -ENOMEM; goto error; } err = nlmsg_multicast(prev->genl_sock, tmp, - portid, group, flags); + portid, group, GFP_ATOMIC); if (!err) delivered = true; else if (err != -ESRCH) @@ -1954,27 +1950,31 @@ static int genlmsg_mcast(struct sk_buff *skb, u32 portid, unsigned long group, prev = net; } + err = nlmsg_multicast(prev->genl_sock, skb, portid, group, GFP_ATOMIC); + + rcu_read_unlock(); - err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); if (!err) delivered = true; else if (err != -ESRCH) return err; return delivered ? 0 : -ESRCH; error: + rcu_read_unlock(); + kfree_skb(skb); return err; } int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, - unsigned int group, gfp_t flags) + unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; - return genlmsg_mcast(skb, portid, group, flags); + return genlmsg_mcast(skb, portid, group); } EXPORT_SYMBOL(genlmsg_multicast_allns); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9ab777e0bd4d..d7d099f7118a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17986,10 +17986,8 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, genlmsg_end(msg, hdr); - rcu_read_lock(); genlmsg_multicast_allns(&nl80211_fam, msg, 0, - NL80211_MCGRP_REGULATORY, GFP_ATOMIC); - rcu_read_unlock(); + NL80211_MCGRP_REGULATORY); return; @@ -18722,10 +18720,8 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, genlmsg_end(msg, hdr); - rcu_read_lock(); genlmsg_multicast_allns(&nl80211_fam, msg, 0, - NL80211_MCGRP_REGULATORY, GFP_ATOMIC); - rcu_read_unlock(); + NL80211_MCGRP_REGULATORY); return; -- cgit v1.2.3 From d96016a764f6aa5c7528c3d3f9cb472ef7266951 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Fri, 11 Oct 2024 14:17:30 +0200 Subject: udp: Compute L4 checksum as usual when not segmenting the skb If: 1) the user requested USO, but 2) there is not enough payload for GSO to kick in, and 3) the egress device doesn't offer checksum offload, then we want to compute the L4 checksum in software early on. In the case when we are not taking the GSO path, but it has been requested, the software checksum fallback in skb_segment doesn't get a chance to compute the full checksum, if the egress device can't do it. As a result we end up sending UDP datagrams with only a partial checksum filled in, which the peer will discard. Fixes: 10154dbded6d ("udp: Allow GSO transmit from devices with no checksum offload") Reported-by: Ivan Babrou Signed-off-by: Jakub Sitnicki Acked-by: Willem de Bruijn Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20241011-uso-swcsum-fixup-v2-1-6e1ddc199af9@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 4 +++- net/ipv6/udp.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8accbf4cb295..2849b273b131 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -951,8 +951,10 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); + + /* Don't checksum the payload, skb will get segmented */ + goto csum_partial; } - goto csum_partial; } if (is_udplite) /* UDP-Lite */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 52dfbb2ff1a8..0cef8ae5d1ea 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1266,8 +1266,10 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); + + /* Don't checksum the payload, skb will get segmented */ + goto csum_partial; } - goto csum_partial; } if (is_udplite) -- cgit v1.2.3 From 92f3715e1eba1d41e55be06159dc3d856b18326d Mon Sep 17 00:00:00 2001 From: Tyrone Wu Date: Fri, 11 Oct 2024 19:32:51 +0000 Subject: bpf: Fix link info netfilter flags to populate defrag flag This fix correctly populates the `bpf_link_info.netfilter.flags` field when user passes the `BPF_F_NETFILTER_IP_DEFRAG` flag. Fixes: 91721c2d02d3 ("netfilter: bpf: Support BPF_F_NETFILTER_IP_DEFRAG in netfilter link") Signed-off-by: Tyrone Wu Signed-off-by: Daniel Borkmann Acked-by: Florian Westphal Cc: Daniel Xu Link: https://lore.kernel.org/bpf/20241011193252.178997-1-wudevelops@gmail.com --- net/netfilter/nf_bpf_link.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 5257d5e7eb09..797fe8a9971e 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -150,11 +150,12 @@ static int bpf_nf_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + const struct nf_defrag_hook *hook = nf_link->defrag_hook; info->netfilter.pf = nf_link->hook_ops.pf; info->netfilter.hooknum = nf_link->hook_ops.hooknum; info->netfilter.priority = nf_link->hook_ops.priority; - info->netfilter.flags = 0; + info->netfilter.flags = hook ? BPF_F_NETFILTER_IP_DEFRAG : 0; return 0; } -- cgit v1.2.3 From a9b7b535ba192c6b77e6c15a4c82d853163eab8c Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Fri, 4 Oct 2024 23:04:08 +0000 Subject: Bluetooth: ISO: Fix multiple init when debugfs is disabled If bt_debugfs is not created successfully, which happens if either CONFIG_DEBUG_FS or CONFIG_DEBUG_FS_ALLOW_ALL is unset, then iso_init() returns early and does not set iso_inited to true. This means that a subsequent call to iso_init() will result in duplicate calls to proto_register(), bt_sock_register(), etc. With CONFIG_LIST_HARDENED and CONFIG_BUG_ON_DATA_CORRUPTION enabled, the duplicate call to proto_register() triggers this BUG(): list_add double add: new=ffffffffc0b280d0, prev=ffffffffbab56250, next=ffffffffc0b280d0. ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:35! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 2 PID: 887 Comm: bluetoothd Not tainted 6.10.11-1-ao-desktop #1 RIP: 0010:__list_add_valid_or_report+0x9a/0xa0 ... __list_add_valid_or_report+0x9a/0xa0 proto_register+0x2b5/0x340 iso_init+0x23/0x150 [bluetooth] set_iso_socket_func+0x68/0x1b0 [bluetooth] kmem_cache_free+0x308/0x330 hci_sock_sendmsg+0x990/0x9e0 [bluetooth] __sock_sendmsg+0x7b/0x80 sock_write_iter+0x9a/0x110 do_iter_readv_writev+0x11d/0x220 vfs_writev+0x180/0x3e0 do_writev+0xca/0x100 ... This change removes the early return. The check for iso_debugfs being NULL was unnecessary, it is always NULL when iso_inited is false. Cc: stable@vger.kernel.org Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Aaron Thompson Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index d5e00d0dd1a0..c9eefb43bf47 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2301,13 +2301,9 @@ int iso_init(void) hci_register_cb(&iso_cb); - if (IS_ERR_OR_NULL(bt_debugfs)) - return 0; - - if (!iso_debugfs) { + if (!IS_ERR_OR_NULL(bt_debugfs)) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - } iso_inited = true; -- cgit v1.2.3 From d458cd1221e9e56da3b2cc5518ad3225caa91f20 Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Fri, 4 Oct 2024 23:04:09 +0000 Subject: Bluetooth: Call iso_exit() on module unload If iso_init() has been called, iso_exit() must be called on module unload. Without that, the struct proto that iso_init() registered with proto_register() becomes invalid, which could cause unpredictable problems later. In my case, with CONFIG_LIST_HARDENED and CONFIG_BUG_ON_DATA_CORRUPTION enabled, loading the module again usually triggers this BUG(): list_add corruption. next->prev should be prev (ffffffffb5355fd0), but was 0000000000000068. (next=ffffffffc0a010d0). ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:29! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 4159 Comm: modprobe Not tainted 6.10.11-4+bt2-ao-desktop #1 RIP: 0010:__list_add_valid_or_report+0x61/0xa0 ... __list_add_valid_or_report+0x61/0xa0 proto_register+0x299/0x320 hci_sock_init+0x16/0xc0 [bluetooth] bt_init+0x68/0xd0 [bluetooth] __pfx_bt_init+0x10/0x10 [bluetooth] do_one_initcall+0x80/0x2f0 do_init_module+0x8b/0x230 __do_sys_init_module+0x15f/0x190 do_syscall_64+0x68/0x110 ... Cc: stable@vger.kernel.org Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Aaron Thompson Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/af_bluetooth.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 67604ccec2f4..9425d0680844 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -830,6 +830,8 @@ cleanup_led: static void __exit bt_exit(void) { + iso_exit(); + mgmt_exit(); sco_exit(); -- cgit v1.2.3 From 1db4564f101b47188c1b71696bd342ef09172b22 Mon Sep 17 00:00:00 2001 From: Aaron Thompson Date: Fri, 4 Oct 2024 23:04:10 +0000 Subject: Bluetooth: Remove debugfs directory on module init failure If bt_init() fails, the debugfs directory currently is not removed. If the module is loaded again after that, the debugfs directory is not set up properly due to the existing directory. # modprobe bluetooth # ls -laF /sys/kernel/debug/bluetooth total 0 drwxr-xr-x 2 root root 0 Sep 27 14:26 ./ drwx------ 31 root root 0 Sep 27 14:25 ../ -r--r--r-- 1 root root 0 Sep 27 14:26 l2cap -r--r--r-- 1 root root 0 Sep 27 14:26 sco # modprobe -r bluetooth # ls -laF /sys/kernel/debug/bluetooth ls: cannot access '/sys/kernel/debug/bluetooth': No such file or directory # # modprobe bluetooth modprobe: ERROR: could not insert 'bluetooth': Invalid argument # dmesg | tail -n 6 Bluetooth: Core ver 2.22 NET: Registered PF_BLUETOOTH protocol family Bluetooth: HCI device and connection manager initialized Bluetooth: HCI socket layer initialized Bluetooth: Faking l2cap_init() failure for testing NET: Unregistered PF_BLUETOOTH protocol family # ls -laF /sys/kernel/debug/bluetooth total 0 drwxr-xr-x 2 root root 0 Sep 27 14:31 ./ drwx------ 31 root root 0 Sep 27 14:26 ../ # # modprobe bluetooth # dmesg | tail -n 7 Bluetooth: Core ver 2.22 debugfs: Directory 'bluetooth' with parent '/' already present! NET: Registered PF_BLUETOOTH protocol family Bluetooth: HCI device and connection manager initialized Bluetooth: HCI socket layer initialized Bluetooth: L2CAP socket layer initialized Bluetooth: SCO socket layer initialized # ls -laF /sys/kernel/debug/bluetooth total 0 drwxr-xr-x 2 root root 0 Sep 27 14:31 ./ drwx------ 31 root root 0 Sep 27 14:26 ../ # Cc: stable@vger.kernel.org Fixes: ffcecac6a738 ("Bluetooth: Create root debugfs directory during module init") Signed-off-by: Aaron Thompson Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/af_bluetooth.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 9425d0680844..e39fba5565c5 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -825,6 +825,7 @@ cleanup_sysfs: bt_sysfs_cleanup(); cleanup_led: bt_leds_cleanup(); + debugfs_remove_recursive(bt_debugfs); return err; } -- cgit v1.2.3 From 64a90991ba8d4e32e3173ddd83d0b24167a5668c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 14 Oct 2024 17:07:08 +0800 Subject: Bluetooth: bnep: fix wild-memory-access in proto_unregister There's issue as follows: KASAN: maybe wild-memory-access in range [0xdead...108-0xdead...10f] CPU: 3 UID: 0 PID: 2805 Comm: rmmod Tainted: G W RIP: 0010:proto_unregister+0xee/0x400 Call Trace: __do_sys_delete_module+0x318/0x580 do_syscall_64+0xc1/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f As bnep_init() ignore bnep_sock_init()'s return value, and bnep_sock_init() will cleanup all resource. Then when remove bnep module will call bnep_sock_cleanup() to cleanup sock's resource. To solve above issue just return bnep_sock_init()'s return value in bnep_exit(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Ye Bin Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/bnep/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index a3bc0934cc13..d44987d4515c 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -745,8 +745,7 @@ static int __init bnep_init(void) if (flt[0]) BT_INFO("BNEP filters: %s", flt); - bnep_sock_init(); - return 0; + return bnep_sock_init(); } static void __exit bnep_exit(void) -- cgit v1.2.3 From 7decd1f5904a489d3ccdcf131972f94645681689 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 15 Oct 2024 10:38:47 +0200 Subject: mptcp: pm: fix UaF read in mptcp_pm_nl_rm_addr_or_subflow Syzkaller reported this splat: ================================================================== BUG: KASAN: slab-use-after-free in mptcp_pm_nl_rm_addr_or_subflow+0xb44/0xcc0 net/mptcp/pm_netlink.c:881 Read of size 4 at addr ffff8880569ac858 by task syz.1.2799/14662 CPU: 0 UID: 0 PID: 14662 Comm: syz.1.2799 Not tainted 6.12.0-rc2-syzkaller-00307-g36c254515dc6 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:377 [inline] print_report+0xc3/0x620 mm/kasan/report.c:488 kasan_report+0xd9/0x110 mm/kasan/report.c:601 mptcp_pm_nl_rm_addr_or_subflow+0xb44/0xcc0 net/mptcp/pm_netlink.c:881 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:914 [inline] mptcp_nl_remove_id_zero_address+0x305/0x4a0 net/mptcp/pm_netlink.c:1572 mptcp_pm_nl_del_addr_doit+0x5c9/0x770 net/mptcp/pm_netlink.c:1603 genl_family_rcv_msg_doit+0x202/0x2f0 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x565/0x800 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x165/0x410 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg net/socket.c:744 [inline] ____sys_sendmsg+0x9ae/0xb40 net/socket.c:2607 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2661 __sys_sendmsg+0x117/0x1f0 net/socket.c:2690 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e RIP: 0023:0xf7fe4579 Code: b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00 RSP: 002b:00000000f574556c EFLAGS: 00000296 ORIG_RAX: 0000000000000172 RAX: ffffffffffffffda RBX: 000000000000000b RCX: 0000000020000140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000296 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 5387: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:394 kmalloc_noprof include/linux/slab.h:878 [inline] kzalloc_noprof include/linux/slab.h:1014 [inline] subflow_create_ctx+0x87/0x2a0 net/mptcp/subflow.c:1803 subflow_ulp_init+0xc3/0x4d0 net/mptcp/subflow.c:1956 __tcp_set_ulp net/ipv4/tcp_ulp.c:146 [inline] tcp_set_ulp+0x326/0x7f0 net/ipv4/tcp_ulp.c:167 mptcp_subflow_create_socket+0x4ae/0x10a0 net/mptcp/subflow.c:1764 __mptcp_subflow_connect+0x3cc/0x1490 net/mptcp/subflow.c:1592 mptcp_pm_create_subflow_or_signal_addr+0xbda/0x23a0 net/mptcp/pm_netlink.c:642 mptcp_pm_nl_fully_established net/mptcp/pm_netlink.c:650 [inline] mptcp_pm_nl_work+0x3a1/0x4f0 net/mptcp/pm_netlink.c:943 mptcp_worker+0x15a/0x1240 net/mptcp/protocol.c:2777 process_one_work+0x958/0x1b30 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x6c8/0xf00 kernel/workqueue.c:3391 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Freed by task 113: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 kasan_save_free_info+0x3b/0x60 mm/kasan/generic.c:579 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x51/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:230 [inline] slab_free_hook mm/slub.c:2342 [inline] slab_free mm/slub.c:4579 [inline] kfree+0x14f/0x4b0 mm/slub.c:4727 kvfree+0x47/0x50 mm/util.c:701 kvfree_rcu_list+0xf5/0x2c0 kernel/rcu/tree.c:3423 kvfree_rcu_drain_ready kernel/rcu/tree.c:3563 [inline] kfree_rcu_monitor+0x503/0x8b0 kernel/rcu/tree.c:3632 kfree_rcu_shrink_scan+0x245/0x3a0 kernel/rcu/tree.c:3966 do_shrink_slab+0x44f/0x11c0 mm/shrinker.c:435 shrink_slab+0x32b/0x12a0 mm/shrinker.c:662 shrink_one+0x47e/0x7b0 mm/vmscan.c:4818 shrink_many mm/vmscan.c:4879 [inline] lru_gen_shrink_node mm/vmscan.c:4957 [inline] shrink_node+0x2452/0x39d0 mm/vmscan.c:5937 kswapd_shrink_node mm/vmscan.c:6765 [inline] balance_pgdat+0xc19/0x18f0 mm/vmscan.c:6957 kswapd+0x5ea/0xbf0 mm/vmscan.c:7226 kthread+0x2c1/0x3a0 kernel/kthread.c:389 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Last potentially related work creation: kasan_save_stack+0x33/0x60 mm/kasan/common.c:47 __kasan_record_aux_stack+0xba/0xd0 mm/kasan/generic.c:541 kvfree_call_rcu+0x74/0xbe0 kernel/rcu/tree.c:3810 subflow_ulp_release+0x2ae/0x350 net/mptcp/subflow.c:2009 tcp_cleanup_ulp+0x7c/0x130 net/ipv4/tcp_ulp.c:124 tcp_v4_destroy_sock+0x1c5/0x6a0 net/ipv4/tcp_ipv4.c:2541 inet_csk_destroy_sock+0x1a3/0x440 net/ipv4/inet_connection_sock.c:1293 tcp_done+0x252/0x350 net/ipv4/tcp.c:4870 tcp_rcv_state_process+0x379b/0x4f30 net/ipv4/tcp_input.c:6933 tcp_v4_do_rcv+0x1ad/0xa90 net/ipv4/tcp_ipv4.c:1938 sk_backlog_rcv include/net/sock.h:1115 [inline] __release_sock+0x31b/0x400 net/core/sock.c:3072 __tcp_close+0x4f3/0xff0 net/ipv4/tcp.c:3142 __mptcp_close_ssk+0x331/0x14d0 net/mptcp/protocol.c:2489 mptcp_close_ssk net/mptcp/protocol.c:2543 [inline] mptcp_close_ssk+0x150/0x220 net/mptcp/protocol.c:2526 mptcp_pm_nl_rm_addr_or_subflow+0x2be/0xcc0 net/mptcp/pm_netlink.c:878 mptcp_pm_nl_rm_subflow_received net/mptcp/pm_netlink.c:914 [inline] mptcp_nl_remove_id_zero_address+0x305/0x4a0 net/mptcp/pm_netlink.c:1572 mptcp_pm_nl_del_addr_doit+0x5c9/0x770 net/mptcp/pm_netlink.c:1603 genl_family_rcv_msg_doit+0x202/0x2f0 net/netlink/genetlink.c:1115 genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0x565/0x800 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x165/0x410 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg net/socket.c:744 [inline] ____sys_sendmsg+0x9ae/0xb40 net/socket.c:2607 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2661 __sys_sendmsg+0x117/0x1f0 net/socket.c:2690 do_syscall_32_irqs_on arch/x86/entry/common.c:165 [inline] __do_fast_syscall_32+0x73/0x120 arch/x86/entry/common.c:386 do_fast_syscall_32+0x32/0x80 arch/x86/entry/common.c:411 entry_SYSENTER_compat_after_hwframe+0x84/0x8e The buggy address belongs to the object at ffff8880569ac800 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 88 bytes inside of freed 512-byte region [ffff8880569ac800, ffff8880569aca00) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x569ac head: order:2 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x4fff00000000040(head|node=1|zone=1|lastcpupid=0x7ff) page_type: f5(slab) raw: 04fff00000000040 ffff88801ac42c80 dead000000000100 dead000000000122 raw: 0000000000000000 0000000080100010 00000001f5000000 0000000000000000 head: 04fff00000000040 ffff88801ac42c80 dead000000000100 dead000000000122 head: 0000000000000000 0000000080100010 00000001f5000000 0000000000000000 head: 04fff00000000002 ffffea00015a6b01 ffffffffffffffff 0000000000000000 head: 0000000000000004 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 10238, tgid 10238 (kworker/u32:6), ts 597403252405, free_ts 597177952947 set_page_owner include/linux/page_owner.h:32 [inline] post_alloc_hook+0x2d1/0x350 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1545 [inline] get_page_from_freelist+0x101e/0x3070 mm/page_alloc.c:3457 __alloc_pages_noprof+0x223/0x25a0 mm/page_alloc.c:4733 alloc_pages_mpol_noprof+0x2c9/0x610 mm/mempolicy.c:2265 alloc_slab_page mm/slub.c:2412 [inline] allocate_slab mm/slub.c:2578 [inline] new_slab+0x2ba/0x3f0 mm/slub.c:2631 ___slab_alloc+0xd1d/0x16f0 mm/slub.c:3818 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] __kmalloc_cache_noprof+0x2c5/0x310 mm/slub.c:4290 kmalloc_noprof include/linux/slab.h:878 [inline] kzalloc_noprof include/linux/slab.h:1014 [inline] mld_add_delrec net/ipv6/mcast.c:743 [inline] igmp6_leave_group net/ipv6/mcast.c:2625 [inline] igmp6_group_dropped+0x4ab/0xe40 net/ipv6/mcast.c:723 __ipv6_dev_mc_dec+0x281/0x360 net/ipv6/mcast.c:979 addrconf_leave_solict net/ipv6/addrconf.c:2253 [inline] __ipv6_ifa_notify+0x3f6/0xc30 net/ipv6/addrconf.c:6283 addrconf_ifdown.isra.0+0xef9/0x1a20 net/ipv6/addrconf.c:3982 addrconf_notify+0x220/0x19c0 net/ipv6/addrconf.c:3781 notifier_call_chain+0xb9/0x410 kernel/notifier.c:93 call_netdevice_notifiers_info+0xbe/0x140 net/core/dev.c:1996 call_netdevice_notifiers_extack net/core/dev.c:2034 [inline] call_netdevice_notifiers net/core/dev.c:2048 [inline] dev_close_many+0x333/0x6a0 net/core/dev.c:1589 page last free pid 13136 tgid 13136 stack trace: reset_page_owner include/linux/page_owner.h:25 [inline] free_pages_prepare mm/page_alloc.c:1108 [inline] free_unref_page+0x5f4/0xdc0 mm/page_alloc.c:2638 stack_depot_save_flags+0x2da/0x900 lib/stackdepot.c:666 kasan_save_stack+0x42/0x60 mm/kasan/common.c:48 kasan_save_track+0x14/0x30 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:319 [inline] __kasan_slab_alloc+0x89/0x90 mm/kasan/common.c:345 kasan_slab_alloc include/linux/kasan.h:247 [inline] slab_post_alloc_hook mm/slub.c:4085 [inline] slab_alloc_node mm/slub.c:4134 [inline] kmem_cache_alloc_noprof+0x121/0x2f0 mm/slub.c:4141 skb_clone+0x190/0x3f0 net/core/skbuff.c:2084 do_one_broadcast net/netlink/af_netlink.c:1462 [inline] netlink_broadcast_filtered+0xb11/0xef0 net/netlink/af_netlink.c:1540 netlink_broadcast+0x39/0x50 net/netlink/af_netlink.c:1564 uevent_net_broadcast_untagged lib/kobject_uevent.c:331 [inline] kobject_uevent_net_broadcast lib/kobject_uevent.c:410 [inline] kobject_uevent_env+0xacd/0x1670 lib/kobject_uevent.c:608 device_del+0x623/0x9f0 drivers/base/core.c:3882 snd_card_disconnect.part.0+0x58a/0x7c0 sound/core/init.c:546 snd_card_disconnect+0x1f/0x30 sound/core/init.c:495 snd_usx2y_disconnect+0xe9/0x1f0 sound/usb/usx2y/usbusx2y.c:417 usb_unbind_interface+0x1e8/0x970 drivers/usb/core/driver.c:461 device_remove drivers/base/dd.c:569 [inline] device_remove+0x122/0x170 drivers/base/dd.c:561 That's because 'subflow' is used just after 'mptcp_close_ssk(subflow)', which will initiate the release of its memory. Even if it is very likely the release and the re-utilisation will be done later on, it is of course better to avoid any issues and read the content of 'subflow' before closing it. Fixes: 1c1f72137598 ("mptcp: pm: only decrement add_addr_accepted for MPJ req") Cc: stable@vger.kernel.org Reported-by: syzbot+3c8b7a8e7df6a2a226ca@syzkaller.appspotmail.com Closes: https://lore.kernel.org/670d7337.050a0220.4cbc0.004f.GAE@google.com Signed-off-by: Matthieu Baerts (NGI0) Acked-by: Paolo Abeni Link: https://patch.msgid.link/20241015-net-mptcp-uaf-pm-rm-v1-1-c4ee5d987a64@kernel.org Signed-off-by: Paolo Abeni --- net/mptcp/pm_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1a78998fe1f4..db586a5b3866 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -873,12 +873,12 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, i, rm_id, id, remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); + removed |= subflow->request_join; /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); - removed |= subflow->request_join; if (rm_type == MPTCP_MIB_RMSUBFLOW) __MPTCP_INC_STATS(sock_net(sk), rm_type); } -- cgit v1.2.3 From 9c5bd93edf7b8834aecaa7c340b852d5990d7c78 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sun, 13 Oct 2024 18:26:39 +0200 Subject: bpf, sockmap: SK_DROP on attempted redirects of unsupported af_vsock Don't mislead the callers of bpf_{sk,msg}_redirect_{map,hash}(): make sure to immediately and visibly fail the forwarding of unsupported af_vsock packets. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241013-vsock-fixes-for-redir-v2-1-d6577bbfe742@rbox.co --- include/net/sock.h | 5 +++++ net/core/sock_map.c | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index c58ca8dd561b..c87295f3476d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2715,6 +2715,11 @@ static inline bool sk_is_stream_unix(const struct sock *sk) return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; } +static inline bool sk_is_vsock(const struct sock *sk) +{ + return sk->sk_family == AF_VSOCK; +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 242c91a6e3d3..07d6aa4e39ef 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -647,6 +647,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, sk = __sock_map_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; @@ -675,6 +677,8 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg, return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; @@ -1249,6 +1253,8 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, sk = __sock_hash_lookup_elem(map, key); if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; + if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk)) + return SK_DROP; skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; @@ -1277,6 +1283,8 @@ BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg, return SK_DROP; if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk)) return SK_DROP; + if (sk_is_vsock(sk)) + return SK_DROP; msg->flags = flags; msg->sk_redir = sk; -- cgit v1.2.3 From 3543152f2d330141d9394d28855cb90b860091d2 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sun, 13 Oct 2024 18:26:40 +0200 Subject: vsock: Update rx_bytes on read_skb() Make sure virtio_transport_inc_rx_pkt() and virtio_transport_dec_rx_pkt() calls are balanced (i.e. virtio_vsock_sock::rx_bytes doesn't lie) after vsock_transport::read_skb(). While here, also inform the peer that we've freed up space and it has more credit. Failing to update rx_bytes after packet is dequeued leads to a warning on SOCK_STREAM recv(): [ 233.396654] rx_queue is empty, but rx_bytes is non-zero [ 233.396702] WARNING: CPU: 11 PID: 40601 at net/vmw_vsock/virtio_transport_common.c:589 Fixes: 634f1a7110b4 ("vsock: support sockmap") Suggested-by: Stefano Garzarella Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: Stefano Garzarella Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241013-vsock-fixes-for-redir-v2-2-d6577bbfe742@rbox.co --- net/vmw_vsock/virtio_transport_common.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 884ee128851e..2e5ad96825cc 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1707,6 +1707,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto { struct virtio_vsock_sock *vvs = vsk->trans; struct sock *sk = sk_vsock(vsk); + struct virtio_vsock_hdr *hdr; struct sk_buff *skb; int off = 0; int err; @@ -1716,10 +1717,16 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto * works for types other than dgrams. */ skb = __skb_recv_datagram(sk, &vvs->rx_queue, MSG_DONTWAIT, &off, &err); + if (!skb) { + spin_unlock_bh(&vvs->rx_lock); + return err; + } + + hdr = virtio_vsock_hdr(skb); + virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len)); spin_unlock_bh(&vvs->rx_lock); - if (!skb) - return err; + virtio_transport_send_credit_update(vsk); return recv_actor(sk, skb); } -- cgit v1.2.3 From 6dafde852df8de3617d4b9f835b629aaeaccd01d Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sun, 13 Oct 2024 18:26:41 +0200 Subject: vsock: Update msg_count on read_skb() Dequeuing via vsock_transport::read_skb() left msg_count outdated, which then confused SOCK_SEQPACKET recv(). Decrease the counter. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Reviewed-by: Stefano Garzarella Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241013-vsock-fixes-for-redir-v2-3-d6577bbfe742@rbox.co --- net/vmw_vsock/virtio_transport_common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 2e5ad96825cc..ccbd2bc0d210 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1723,6 +1723,9 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto } hdr = virtio_vsock_hdr(skb); + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) + vvs->msg_count--; + virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len)); spin_unlock_bh(&vvs->rx_lock); -- cgit v1.2.3 From 19039f279797efbe044cae41ee216c5fe481fc33 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Sun, 13 Oct 2024 18:26:42 +0200 Subject: bpf, vsock: Drop static vsock_bpf_prot initialization vsock_bpf_prot is set up at runtime. Remove the superfluous init. No functional change intended. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Michal Luczaj Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241013-vsock-fixes-for-redir-v2-4-d6577bbfe742@rbox.co --- net/vmw_vsock/vsock_bpf.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index c42c5cc18f32..4aa6e74ec295 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -114,14 +114,6 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, return copied; } -/* Copy of original proto with updated sock_map methods */ -static struct proto vsock_bpf_prot = { - .close = sock_map_close, - .recvmsg = vsock_bpf_recvmsg, - .sock_is_readable = sk_msg_is_readable, - .unhash = sock_map_unhash, -}; - static void vsock_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; -- cgit v1.2.3