From 44badc908f2c85711cb18e45e13119c10ad3a05f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 24 Sep 2024 09:05:45 +0100 Subject: tcp: Fix spelling mistake "emtpy" -> "empty" There is a spelling mistake in a WARN_ONCE message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Jason Xing Link: https://patch.msgid.link/20240924080545.1324962-1-colin.i.king@gmail.com Signed-off-by: Paolo Abeni --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d1948d357dad..739a9fb83d0c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2442,7 +2442,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk) return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { WARN_ONCE(1, - "rtx queue emtpy: " + "rtx queue empty: " "out:%u sacked:%u lost:%u retrans:%u " "tlp_high_seq:%u sk_state:%u ca_state:%u " "advmss:%u mss_cache:%u pmtu:%u\n", -- cgit v1.2.3 From e26a0c5d828b225b88f534e2fcf10bf617f85f23 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 29 Sep 2024 20:44:35 -0700 Subject: net: mana: Increase the DEF_RX_BUFFERS_PER_QUEUE to 1024 Through some experiments, we found out that increasing the default RX buffers count from 512 to 1024, gives slightly better throughput and significantly reduces the no_wqe_rx errs on the receiver side. Along with these, other parameters like cpu usage, retrans seg etc also show some improvement with 1024 value. Following are some snippets from the experiments ntttcp tests with 512 Rx buffers --------------------------------------- connections| throughput| no_wqe errs| --------------------------------------- 1 | 40.93Gbps | 123,211 | 16 | 180.15Gbps | 190,120 | 128 | 180.20Gbps | 173,508 | 256 | 180.27Gbps | 189,884 | ntttcp tests with 1024 Rx buffers --------------------------------------- connections| throughput| no_wqe errs| --------------------------------------- 1 | 44.22Gbps | 19,864 | 16 | 180.19Gbps | 4,430 | 128 | 180.21Gbps | 2,560 | 256 | 180.29Gbps | 1,529 | So, increasing the default RX buffers per queue count to 1024 Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/1727667875-29908-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Paolo Abeni --- include/net/mana/mana.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index f2a5200d8a0f..9b0faa24b758 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -43,7 +43,7 @@ enum TRI_STATE { * size beyond this value gets rejected by __alloc_page() call. */ #define MAX_RX_BUFFERS_PER_QUEUE 8192 -#define DEF_RX_BUFFERS_PER_QUEUE 512 +#define DEF_RX_BUFFERS_PER_QUEUE 1024 #define MIN_RX_BUFFERS_PER_QUEUE 128 /* This max value for TX buffers is derived as the maximum allocatable -- cgit v1.2.3 From 7e863e5db6185b1add0df4cb01b31a4ed1c4b738 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:43 +0200 Subject: ipv4: Convert ip_route_input() to dscp_t. Pass a dscp_t variable to ip_route_input(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input() to consider are: * input_action_end_dx4_finish() and input_action_end_dt4() in net/ipv6/seg6_local.c. These functions set the tos parameter to 0, which is already a valid dscp_t value, so they don't need to be adjusted for the new prototype. * icmp_route_lookup(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * br_nf_pre_routing_finish(), ip_options_rcv_srr() and ip4ip6_err(), which get the DSCP directly from IPv4 headers. Define a helper to read the .tos field of struct iphdr as dscp_t, so that these function don't have to do the conversion manually. While there, declare *iph as const in br_nf_pre_routing_finish(), declare its local variables in reverse-christmas-tree order and move the "err = ip_route_input()" assignment out of the conditional to avoid checkpatch warning. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/e9d40781d64d3d69f4c79ac8a008b8d67a033e8d.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 5 +++++ include/net/route.h | 5 +++-- net/bridge/br_netfilter_hooks.c | 8 +++++--- net/ipv4/icmp.c | 2 +- net/ipv4/ip_options.c | 3 ++- net/ipv6/ip6_tunnel.c | 4 ++-- 6 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index d92d3bc3ec0e..bab084df1567 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -424,6 +424,11 @@ int ip_decrease_ttl(struct iphdr *iph) return --iph->ttl; } +static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) +{ + return inet_dsfield_to_dscp(ip4h->tos); +} + static inline int ip_mtu_locked(const struct dst_entry *dst) { const struct rtable *rt = dst_rtable(dst); diff --git a/include/net/route.h b/include/net/route.h index 1789f1e6640b..03dd28cf4bc4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -208,12 +208,13 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) + dscp_t dscp, struct net_device *devin) { int err; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, tos, devin); + err = ip_route_input_noref(skb, dst, src, inet_dscp_to_dsfield(dscp), + devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0e8bc0ea6175..c6bab2b5e834 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -369,9 +369,9 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb->dev, *br_indev; - struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *dev = skb->dev, *br_indev; + const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; int err; @@ -389,7 +389,9 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + err = ip_route_input(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (err) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7d7b25ed8d21..23664434922e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -545,7 +545,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - inet_dscp_to_dsfield(dscp), rt2->dst.dev); + dscp, rt2->dst.dev); dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 68aedb8877b9..81e86e5defee 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -617,7 +617,8 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); + err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), + dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b60e13c42bca..48fd53b98972 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -630,8 +630,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } skb_dst_set(skb2, &rt->dst); } else { - if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, - skb2->dev) || + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, + ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } -- cgit v1.2.3 From 66fb6386d358a04edd5c640e38b4a02b323b89d8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:49 +0200 Subject: ipv4: Convert ip_route_input_noref() to dscp_t. Pass a dscp_t variable to ip_route_input_noref(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input_noref() to consider are: * arp_process() in net/ipv4/arp.c. This function sets the tos parameter to 0, which is already a valid dscp_t value, so it doesn't need to be adjusted for the new prototype. * ip_route_input(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * ipvlan_l3_rcv(), bpf_lwt_input_reroute(), ip_expire(), ip_rcv_finish_core(), xfrm4_rcv_encap_finish() and xfrm4_rcv_encap(), which get the DSCP directly from IPv4 headers and can simply use the ip4h_dscp() helper. While there, declare the IPv4 header pointers as const in ipvlan_l3_rcv() and bpf_lwt_input_reroute(). Also, modify the declaration of ip_route_input_noref() in include/net/route.h so that it matches the prototype of its implementation in net/ipv4/route.c. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/a8a747bed452519c4d0cc06af32c7e7795d7b627.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ipvlan/ipvlan_l3s.c | 6 ++++-- include/net/route.h | 7 +++---- net/core/lwt_bpf.c | 5 +++-- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/ip_input.c | 2 +- net/ipv4/route.c | 6 +++--- net/ipv4/xfrm4_input.c | 2 +- net/ipv4/xfrm4_protocol.c | 2 +- 8 files changed, 18 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ipvlan/ipvlan_l3s.c b/drivers/net/ipvlan/ipvlan_l3s.c index d5b05e803219..b4ef386bdb1b 100644 --- a/drivers/net/ipvlan/ipvlan_l3s.c +++ b/drivers/net/ipvlan/ipvlan_l3s.c @@ -2,6 +2,8 @@ /* Copyright (c) 2014 Mahesh Bandewar */ +#include + #include "ipvlan.h" static unsigned int ipvlan_netid __read_mostly; @@ -48,11 +50,11 @@ static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, switch (proto) { case AF_INET: { - struct iphdr *ip4h = ip_hdr(skb); + const struct iphdr *ip4h = ip_hdr(skb); int err; err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, - ip4h->tos, sdev); + ip4h_dscp(ip4h), sdev); if (unlikely(err)) goto out; break; diff --git a/include/net/route.h b/include/net/route.h index 03dd28cf4bc4..5e4374d66927 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -201,8 +201,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin); +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin, const struct sk_buff *hint); @@ -213,8 +213,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, int err; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, inet_dscp_to_dsfield(dscp), - devin); + err = ip_route_input_noref(skb, dst, src, dscp, devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1a14f915b7a4..e0ca24a58810 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -91,12 +92,12 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + ip4h_dscp(iph), dev); dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a92664a5ef2e..48e2810f1f27 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -175,8 +175,8 @@ static void ip_expire(struct timer_list *t) /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), + head->dev); if (err) goto out; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b6e7d4921309..c0a2490eb7c1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -363,7 +363,7 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, */ if (!skb_valid_dst(skb)) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + ip4h_dscp(iph), dev); if (unlikely(err)) goto drop_error; } else { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 723ac9181558..00bfc0a11f64 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2465,14 +2465,14 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + dscp_t dscp, struct net_device *dev) { struct fib_result res; int err; - tos &= INET_DSCP_MASK; rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + err = ip_route_input_rcu(skb, daddr, saddr, inet_dscp_to_dsfield(dscp), + dev, &res); rcu_read_unlock(); return err; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index a620618cc568..b5b06323cfd9 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -33,7 +33,7 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + ip4h_dscp(iph), skb->dev)) goto drop; } diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index b146ce88c5d0..4ee624d8e66f 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -76,7 +76,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + ip4h_dscp(iph), skb->dev)) goto drop; } -- cgit v1.2.3 From 4aecca4c76808f3736056d18ff510df80424bc9f Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 1 Oct 2024 05:57:14 -0700 Subject: net_tstamp: add SCM_TS_OPT_ID to provide OPT_ID in control message SOF_TIMESTAMPING_OPT_ID socket option flag gives a way to correlate TX timestamps and packets sent via socket. Unfortunately, there is no way to reliably predict socket timestamp ID value in case of error returned by sendmsg. For UDP sockets it's impossible because of lockless nature of UDP transmit, several threads may send packets in parallel. In case of RAW sockets MSG_MORE option makes things complicated. More details are in the conversation [1]. This patch adds new control message type to give user-space software an opportunity to control the mapping between packets and values by providing ID with each sendmsg for UDP sockets. The documentation is also added in this patch. [1] https://lore.kernel.org/netdev/CALCETrU0jB+kg0mhV6A8mrHfTE1D1pr1SD_B9Eaa9aDPfgHdtA@mail.gmail.com/ Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20241001125716.2832769-2-vadfed@meta.com Signed-off-by: Jakub Kicinski --- Documentation/networking/timestamping.rst | 14 ++++++++++++++ arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/inet_sock.h | 4 +++- include/net/sock.h | 7 +++++++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 13 +++++++++++++ net/ipv4/ip_output.c | 19 ++++++++++++++----- net/ipv6/ip6_output.c | 20 ++++++++++++++------ 11 files changed, 75 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index 8199e6917671..b37bfbfc7d79 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -194,6 +194,20 @@ SOF_TIMESTAMPING_OPT_ID: among all possibly concurrently outstanding timestamp requests for that socket. + The process can optionally override the default generated ID, by + passing a specific ID with control message SCM_TS_OPT_ID (not + supported for TCP sockets):: + + struct msghdr *msg; + ... + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_TS_OPT_ID; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + *((__u32 *) CMSG_DATA(cmsg)) = opt_id; + err = sendmsg(fd, msg, 0); + + SOF_TIMESTAMPING_OPT_ID_TCP: Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 251b73c5481e..302507bf9b5d 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -146,6 +146,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 8ab7582291ab..d118d4731580 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -157,6 +157,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 38fc0b188e08..d268d69bfcd2 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -138,6 +138,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 0x404C + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 57084ed2f3c4..113cd9f353e3 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -139,6 +139,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x0059 +#define SCM_TS_OPT_ID 0x005a + #if !defined(__KERNEL__) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 394c3b66065e..f01dd273bea6 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -174,6 +174,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u32 ts_opt_id; u64 transmit_time; u32 mark; }; @@ -241,7 +242,8 @@ struct inet_sock { struct inet_cork_full cork; }; -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ enum { INET_FLAGS_PKTINFO = 0, diff --git a/include/net/sock.h b/include/net/sock.h index c58ca8dd561b..ccf28c2b70b1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -954,6 +954,12 @@ enum sock_flags { }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +/* + * The highest bit of sk_tsflags is reserved for kernel-internal + * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that + * SOF_TIMESTAMPING* values do not reach this reserved area + */ +#define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { @@ -1796,6 +1802,7 @@ struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; + u32 ts_opt_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 3b4e3e815602..deacfd6dd197 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -141,6 +141,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index 039be95c40cf..846f494a17cf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2899,6 +2899,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, { u32 tsflags; + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); + switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -2927,6 +2929,17 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; + case SCM_TS_OPT_ID: + if (sk_is_tcp(sk)) + return -EINVAL; + tsflags = READ_ONCE(sk->sk_tsflags); + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 49811c9281d4..0c7049f50369 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -973,7 +973,7 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = dst_rtable(cork->dst); - bool paged, hold_tskey, extra_uref = false; + bool paged, hold_tskey = false, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; @@ -1049,10 +1049,15 @@ static int __ip_append_data(struct sock *sk, cork->length += length; - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; - if (hold_tskey) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } + } /* So, what's going on in the loop below? * @@ -1327,6 +1332,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); + if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->flags |= IPCORK_TS_OPT_ID; + cork->ts_opt_id = ipc->sockc.ts_opt_id; + } return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f26841f1490f..ff6bd8d85e9a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1402,7 +1402,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); - + if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->base.flags |= IPCORK_TS_OPT_ID; + cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; + } cork->base.length = 0; cork->base.transmit_time = ipc6->sockc.transmit_time; @@ -1433,7 +1436,7 @@ static int __ip6_append_data(struct sock *sk, bool zc = false; u32 tskey = 0; struct rt6_info *rt = dst_rt6_info(cork->dst); - bool paged, hold_tskey, extra_uref = false; + bool paged, hold_tskey = false, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; @@ -1543,10 +1546,15 @@ emsgsize: flags &= ~MSG_SPLICE_PAGES; } - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; - if (hold_tskey) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } + } /* * Let's try using as much space as possible. -- cgit v1.2.3 From 822b5bc6db55f1c3ea51659c423784ac6919ddd4 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 1 Oct 2024 05:57:15 -0700 Subject: net_tstamp: add SCM_TS_OPT_ID for RAW sockets The last type of sockets which supports SOF_TIMESTAMPING_OPT_ID is RAW sockets. To add new option this patch converts all callers (direct and indirect) of _sock_tx_timestamp to provide sockcm_cookie instead of tsflags. And while here fix __sock_tx_timestamp to receive tsflags as __u32 instead of __u16. Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20241001125716.2832769-3-vadfed@meta.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 27 ++++++++++++++++++--------- net/can/raw.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp.c | 7 ++++--- net/ipv6/ip6_output.c | 2 +- net/ipv6/raw.c | 2 +- net/packet/af_packet.c | 6 +++--- net/socket.c | 2 +- 9 files changed, 31 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ccf28c2b70b1..e282127092ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2660,39 +2660,48 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, sock_write_timestamp(sk, 0); } -void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); +void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet - * @tsflags: timestamping flags to use + * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ -static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void _sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { + __u32 tsflags = sockc->tsflags; + if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && - tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (tsflags & SOCKCM_FLAG_TS_OPT_ID) + *tskey = sockc->ts_opt_id; + else + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + } } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } -static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags) { - _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); + _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } -static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) +static inline void skb_setup_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc) { - _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, + _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } diff --git a/net/can/raw.c b/net/can/raw.c index 00533f64d69d..255c0a8f39d6 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -966,7 +966,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; - skb_setup_tx_timestamp(skb, sockc.tsflags); + skb_setup_tx_timestamp(skb, &sockc); err = can_send(skb, ro->loopback); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0c7049f50369..e5c55a95063d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1331,7 +1331,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->priority = ipc->priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; - sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); + sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->flags |= IPCORK_TS_OPT_ID; cork->ts_opt_id = ipc->sockc.ts_opt_id; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 474dfd263c8b..0e9e01967ec9 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -370,7 +370,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4f77bd862e95..82cc4a5633ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -477,15 +477,16 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) +static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) { struct sk_buff *skb = tcp_write_queue_tail(sk); + u32 tsflags = sockc->tsflags; if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); + sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack = 1; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) @@ -1321,7 +1322,7 @@ wait_for_space: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags); + tcp_tx_timestamp(sk, &sockc); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ff6bd8d85e9a..205673179b3c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1401,7 +1401,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; - sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); + sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->base.flags |= IPCORK_TS_OPT_ID; cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 608fa9d05b55..8476a3944a88 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -629,7 +629,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a705ec214254..f8942062f776 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2118,7 +2118,7 @@ retry: skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); - skb_setup_tx_timestamp(skb, sockc.tsflags); + skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -2650,7 +2650,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); @@ -3115,7 +3115,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } - skb_setup_tx_timestamp(skb, sockc.tsflags); + skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { diff --git a/net/socket.c b/net/socket.c index 601ad74930ef..3b1b65b9f471 100644 --- a/net/socket.c +++ b/net/socket.c @@ -687,7 +687,7 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) +void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; -- cgit v1.2.3 From 5a9071a760a61b00260334ad576fe60debafaafc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:40 +0000 Subject: tcp: annotate data-races around icsk->icsk_pending icsk->icsk_pending can be read locklessly already. Following patch in the series will add another lockless read. Add smp_load_acquire() and smp_store_release() annotations because following patch will add a test in tcp_write_timer(), and READ_ONCE()/WRITE_ONCE() alone would possibly lead to races. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 4 ++-- net/ipv4/inet_connection_sock.c | 6 ++++-- net/ipv4/inet_diag.c | 10 ++++++---- net/ipv4/tcp_ipv4.c | 10 ++++++---- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 4 ++-- net/ipv6/tcp_ipv6.c | 10 ++++++---- 7 files changed, 28 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c0deaafebfdc..914d19772704 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -197,7 +197,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif @@ -229,7 +229,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { - icsk->icsk_pending = what; + smp_store_release(&icsk->icsk_pending, what); icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2c5632d4fddb..8c53385cc808 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -775,7 +775,8 @@ void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_pending = icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_pending, 0); + icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -790,7 +791,8 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) /* ongoing timer handlers need to acquire socket lock. */ sock_not_owned_by_me(sk); - icsk->icsk_pending = icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_pending, 0); + icsk->icsk_ack.pending = 0; sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 67639309163d..321acc8abf17 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -247,6 +247,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + u8 icsk_pending; int protocol; cb_data = cb->data; @@ -307,14 +308,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto out; } - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5afe5e57c89b..985028434f64 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2900,15 +2900,17 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); + u8 icsk_pending; int rx_queue; int state; - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sk->sk_timer)) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4fd746bd4d54..4d0407301603 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2960,7 +2960,7 @@ void tcp_send_loss_probe(struct sock *sk) WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); - inet_csk(sk)->icsk_pending = 0; + smp_store_release(&inet_csk(sk)->icsk_pending, 0); return; } @@ -2993,7 +2993,7 @@ probe_sent: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ - inet_csk(sk)->icsk_pending = 0; + smp_store_release(&inet_csk(sk)->icsk_pending, 0); rearm_timer: tcp_rearm_rto(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 79064580c8c0..56c597e763ac 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -701,11 +701,11 @@ void tcp_write_timer_handler(struct sock *sk) tcp_send_loss_probe(sk); break; case ICSK_TIME_RETRANS: - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); tcp_probe_timer(sk); break; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d71ab4e1efe1..7634c0be6acb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2177,6 +2177,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + u8 icsk_pending; int rx_queue; int state; @@ -2185,12 +2186,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { -- cgit v1.2.3 From 81df4fa94ee8c0800ed42c47357435602ed105ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:42 +0000 Subject: tcp: add a fast path in tcp_delack_timer() delack timer is not stopped from inet_csk_clear_xmit_timer() because we do not define INET_CSK_CLEAR_TIMERS. This is a conscious choice : inet_csk_clear_xmit_timer() is often called from another cpu. Calling del_timer() would cause false sharing and lock contention. This means that very often, tcp_delack_timer() is called at the timer expiration, while there is no ACK to transmit. This can be detected very early, avoiding the socket spinlock. Notes: - test about tp->compressed_ack is racy, but in the unlikely case there is a race, the dedicated compressed_ack_timer hrtimer would close it. - Even if the fast path is not taken, reading icsk->icsk_ack.pending and tp->compressed_ack before acquiring the socket spinlock reduces acquisition time and chances of contention. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 5 +++-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp_output.c | 3 ++- net/ipv4/tcp_timer.c | 9 +++++++++ net/mptcp/protocol.c | 3 ++- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 914d19772704..3c82fad904d4 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -202,7 +202,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -233,7 +233,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); } else { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8c53385cc808..12e975ed4910 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -776,7 +776,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); smp_store_release(&icsk->icsk_pending, 0); - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -792,7 +792,7 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) sock_not_owned_by_me(sk); smp_store_release(&icsk->icsk_pending, 0); - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4d0407301603..08772395690d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4224,7 +4224,8 @@ void tcp_send_delayed_ack(struct sock *sk) if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b7266b9101ce..c3a7442332d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -361,6 +361,14 @@ static void tcp_delack_timer(struct timer_list *t) from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; + /* Avoid taking socket spinlock if there is no ACK to send. + * The compressed_ack check is racy, but a separate hrtimer + * will take care of it eventually. + */ + if (!(smp_load_acquire(&icsk->icsk_ack.pending) & ICSK_ACK_TIMER) && + !READ_ONCE(tcp_sk(sk)->compressed_ack)) + goto out; + bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); @@ -371,6 +379,7 @@ static void tcp_delack_timer(struct timer_list *t) sock_hold(sk); } bh_unlock_sock(sk); +out: sock_put(sk); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c2317919fc14..e85862352084 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3504,7 +3504,8 @@ static void schedule_3rdack_retransmission(struct sock *ssk) timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } -- cgit v1.2.3 From 539770616521e5b046ca7612eb79ba11b53edb1d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 3 Oct 2024 12:52:17 +0100 Subject: net: dsa: remove obsolete phylink dsa_switch operations No driver now uses the DSA switch phylink members, so we can now remove the method pointers, but we need to leave empty shim functions to allow those drivers that do not provide phylink MAC operations structure to continue functioning. Signed-off-by: Russell King (oracle) Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean # sja1105, felix, dsa_loop Link: https://patch.msgid.link/E1swKNV-0060oN-1b@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 15 --------------- net/dsa/dsa.c | 8 -------- net/dsa/port.c | 34 +--------------------------------- 3 files changed, 1 insertion(+), 56 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d7a6c2930277..72ae65e7246a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -885,21 +885,6 @@ struct dsa_switch_ops { */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, - int port, - phy_interface_t iface); - void (*phylink_mac_config)(struct dsa_switch *ds, int port, - unsigned int mode, - const struct phylink_link_state *state); - void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev, - int speed, int duplex, - bool tx_pause, bool rx_pause); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 1664547deffd..5a7c0e565a89 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1505,14 +1505,6 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (!ds->num_ports) return -EINVAL; - if (ds->phylink_mac_ops) { - if (ds->ops->phylink_mac_select_pcs || - ds->ops->phylink_mac_config || - ds->ops->phylink_mac_link_down || - ds->ops->phylink_mac_link_up) - return -EINVAL; - } - if (np) { err = dsa_switch_parse_of(ds, np); if (err) diff --git a/net/dsa/port.c b/net/dsa/port.c index 25258b33e59e..f1e96706a701 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1579,40 +1579,19 @@ static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP); - struct dsa_switch *ds = dp->ds; - - if (ds->ops->phylink_mac_select_pcs) - pcs = ds->ops->phylink_mac_select_pcs(ds, dp->index, interface); - - return pcs; + return ERR_PTR(-EOPNOTSUPP); } static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_config) - return; - - ds->ops->phylink_mac_config(ds, dp->index, mode, state); } static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_down) - return; - - ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } static void dsa_port_phylink_mac_link_up(struct phylink_config *config, @@ -1622,14 +1601,6 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, int speed, int duplex, bool tx_pause, bool rx_pause) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_up) - return; - - ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev, - speed, duplex, tx_pause, rx_pause); } static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { @@ -1871,9 +1842,6 @@ static void dsa_shared_port_link_down(struct dsa_port *dp) if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down) ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED, PHY_INTERFACE_MODE_NA); - else if (ds->ops->phylink_mac_link_down) - ds->ops->phylink_mac_link_down(ds, dp->index, MLO_AN_FIXED, - PHY_INTERFACE_MODE_NA); } int dsa_shared_port_link_register_of(struct dsa_port *dp) -- cgit v1.2.3 From 76aed95319da25d6884dff01d5f0149e4b542f96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:29 -0700 Subject: rtnetlink: Add per-netns RTNL. The goal is to break RTNL down into per-netns mutex. This patch adds per-netns mutex and its helper functions, rtnl_net_lock() and rtnl_net_unlock(). rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and rtnl_net_unlock() releases them. We will replace 800+ rtnl_lock() with rtnl_net_lock() and finally removes rtnl_lock() in rtnl_net_lock(). When we need to nest per-netns RTNL mutex, we will use __rtnl_net_lock(), and its locking order is defined by rtnl_net_lock_cmp_fn() as follows: 1. init_net is first 2. netns address ascending order Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL with LOCKDEP so that we can carefully add the extra mutex without slowing down RTNL operations during conversion. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 21 ++++++++++++++++ include/net/net_namespace.h | 4 ++++ net/Kconfig.debug | 15 ++++++++++++ net/core/net_namespace.c | 6 +++++ net/core/rtnetlink.c | 58 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+) (limited to 'include/net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index cdfc897f1e3c..edd840a49989 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -92,6 +92,27 @@ static inline bool lockdep_rtnl_is_held(void) #define rcu_replace_pointer_rtnl(rp, p) \ rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net); +void __rtnl_net_unlock(struct net *net); +void rtnl_net_lock(struct net *net); +void rtnl_net_unlock(struct net *net); +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); +#else +static inline void __rtnl_net_lock(struct net *net) {} +static inline void __rtnl_net_unlock(struct net *net) {} + +static inline void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); +} + +static inline void rtnl_net_unlock(struct net *net) +{ + rtnl_unlock(); +} +#endif + static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e67b483cc8bb..873c0f9fdac6 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -188,6 +188,10 @@ struct net { #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + /* Move to a better place when the config guard is removed. */ + struct mutex rtnl_mutex; +#endif } __randomize_layout; #include diff --git a/net/Kconfig.debug b/net/Kconfig.debug index 5e3fffe707dd..277fab8c4d77 100644 --- a/net/Kconfig.debug +++ b/net/Kconfig.debug @@ -24,3 +24,18 @@ config DEBUG_NET help Enable extra sanity checks in networking. This is mostly used by fuzzers, but is safe to select. + +config DEBUG_NET_SMALL_RTNL + bool "Add extra per-netns mutex inside RTNL" + depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT + select PROVE_LOCKING + default n + help + rtnl_lock() is being replaced with rtnl_net_lock() that + acquires the global RTNL and a small per-netns RTNL mutex. + + During the conversion, rtnl_net_lock() just adds an extra + mutex in every RTNL scope and slows down the operations. + + Once the conversion completes, rtnl_lock() will be removed + and rtnetlink will gain per-netns scalability. diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e39479f1c9a4..105e3cd26763 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -334,6 +334,12 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); + +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + mutex_init(&net->rtnl_mutex); + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); +#endif + preinit_net_sysctl(net); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 682d8d3127db..46567fa54e42 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -179,6 +179,64 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_lock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_lock); + +void __rtnl_net_unlock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_unlock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_unlock); + +void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); + __rtnl_net_lock(net); +} +EXPORT_SYMBOL(rtnl_net_lock); + +void rtnl_net_unlock(struct net *net) +{ + __rtnl_net_unlock(net); + rtnl_unlock(); +} +EXPORT_SYMBOL(rtnl_net_unlock); + +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + if (net_eq(net_a, net_b)) + return 0; + + /* always init_net first */ + if (net_eq(net_a, &init_net)) + return -1; + + if (net_eq(net_b, &init_net)) + return 1; + + /* otherwise lock in ascending order */ + return net_a < net_b ? -1 : 1; +} + +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) +{ + const struct net *net_a, *net_b; + + net_a = container_of(a, struct net, rtnl_mutex.dep_map); + net_b = container_of(b, struct net, rtnl_mutex.dep_map); + + return rtnl_net_cmp_locks(net_a, net_b); +} +#endif + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) -- cgit v1.2.3 From 02f220b5267042d0de649614eec84ded8aeecb4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 20:26:55 +0200 Subject: wifi: ipw2x00/lib80211: move remaining lib80211 into libipw There's already much code in libipw that used to be shared with more drivers, but now with the prior cleanups, those old Intel ipw2x00 drivers are also the only ones using whatever is now left of lib80211. Move lib80211 entirely into libipw. Link: https://patch.msgid.link/20241007202707.915ef7b9e7c7.Ib9876d2fe3c90f11d6df458b16d0b7d4bf551a8d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/ipw2x00/Kconfig | 6 - drivers/net/wireless/intel/ipw2x00/Makefile | 6 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 7 +- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 4 +- drivers/net/wireless/intel/ipw2x00/ipw2200.h | 2 - drivers/net/wireless/intel/ipw2x00/libipw.h | 101 ++- drivers/net/wireless/intel/ipw2x00/libipw_crypto.c | 246 +++++++ .../wireless/intel/ipw2x00/libipw_crypto_ccmp.c | 438 ++++++++++++ .../wireless/intel/ipw2x00/libipw_crypto_tkip.c | 728 ++++++++++++++++++++ .../net/wireless/intel/ipw2x00/libipw_crypto_wep.c | 247 +++++++ drivers/net/wireless/intel/ipw2x00/libipw_module.c | 36 +- drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 9 +- drivers/net/wireless/intel/ipw2x00/libipw_tx.c | 4 +- drivers/net/wireless/intel/ipw2x00/libipw_wx.c | 43 +- include/net/lib80211.h | 122 ---- net/wireless/Kconfig | 33 - net/wireless/Makefile | 4 - net/wireless/lib80211.c | 257 ------- net/wireless/lib80211_crypt_ccmp.c | 448 ------------- net/wireless/lib80211_crypt_tkip.c | 738 --------------------- net/wireless/lib80211_crypt_wep.c | 256 ------- 21 files changed, 1825 insertions(+), 1910 deletions(-) create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto.c create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c delete mode 100644 include/net/lib80211.h delete mode 100644 net/wireless/lib80211.c delete mode 100644 net/wireless/lib80211_crypt_ccmp.c delete mode 100644 net/wireless/lib80211_crypt_tkip.c delete mode 100644 net/wireless/lib80211_crypt_wep.c (limited to 'include/net') diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index 1650d5865aa0..d9c042772399 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -10,7 +10,6 @@ config IPW2100 select WEXT_SPY select WEXT_PRIV select FW_LOADER - select LIB80211 select LIBIPW help A driver for the Intel PRO/Wireless 2100 Network @@ -72,7 +71,6 @@ config IPW2200 select WEXT_SPY select WEXT_PRIV select FW_LOADER - select LIB80211 select LIBIPW help A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network @@ -162,10 +160,6 @@ config LIBIPW select CRYPTO select CRYPTO_MICHAEL_MIC select CRC32 - select LIB80211 - select LIB80211_CRYPT_WEP - select LIB80211_CRYPT_TKIP - select LIB80211_CRYPT_CCMP help This option enables the hardware independent IEEE 802.11 networking stack. This component is deprecated in favor of the diff --git a/drivers/net/wireless/intel/ipw2x00/Makefile b/drivers/net/wireless/intel/ipw2x00/Makefile index e1ec50359dff..60c5faccbe15 100644 --- a/drivers/net/wireless/intel/ipw2x00/Makefile +++ b/drivers/net/wireless/intel/ipw2x00/Makefile @@ -12,4 +12,8 @@ libipw-objs := \ libipw_tx.o \ libipw_rx.o \ libipw_wx.o \ - libipw_geo.o + libipw_geo.o \ + libipw_crypto.o \ + libipw_crypto_ccmp.o \ + libipw_crypto_tkip.o \ + libipw_crypto_wep.o diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index b6636002c7d2..a89e06c1b8ee 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -148,9 +148,6 @@ that only one external action is invoked at a time. #include #include #include - -#include - #include "ipw2100.h" #include "ipw.h" @@ -7571,7 +7568,7 @@ static int ipw2100_wx_set_auth(struct net_device *dev, struct ipw2100_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; struct iw_param *param = &wrqu->param; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; unsigned long flags; int ret = 0; @@ -7663,7 +7660,7 @@ static int ipw2100_wx_get_auth(struct net_device *dev, { struct ipw2100_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; struct iw_param *param = &wrqu->param; switch (param->flags & IW_AUTH_INDEX) { diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index eed9ef17bc29..f4fd1fc784b7 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -6549,7 +6549,7 @@ static int ipw_wx_set_auth(struct net_device *dev, struct ipw_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; struct iw_param *param = &wrqu->param; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; unsigned long flags; int ret = 0; @@ -6648,7 +6648,7 @@ static int ipw_wx_get_auth(struct net_device *dev, { struct ipw_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; struct iw_param *param = &wrqu->param; switch (param->flags & IW_AUTH_INDEX) { diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.h b/drivers/net/wireless/intel/ipw2x00/ipw2200.h index 8ebf09121e17..46f119123b49 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.h @@ -31,8 +31,6 @@ #include #include #include - -#include #include #define DRV_NAME "ipw2200" diff --git a/drivers/net/wireless/intel/ipw2x00/libipw.h b/drivers/net/wireless/intel/ipw2x00/libipw.h index bad080d33c07..bc727c99ff3c 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw.h +++ b/drivers/net/wireless/intel/ipw2x00/libipw.h @@ -25,8 +25,6 @@ #include /* ARRAY_SIZE */ #include #include - -#include #include #define LIBIPW_VERSION "git-1.1.13" @@ -699,6 +697,76 @@ struct libipw_geo { struct libipw_channel a[LIBIPW_52GHZ_CHANNELS]; }; +#define NUM_WEP_KEYS 4 + +enum { + IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), +}; + +struct module; + +struct libipw_crypto_ops { + const char *name; + struct list_head list; + + /* init new crypto context (e.g., allocate private data space, + * select IV, etc.); returns NULL on failure or pointer to allocated + * private data on success */ + void *(*init) (int keyidx); + + /* deinitialize crypto context and free allocated private data */ + void (*deinit) (void *priv); + + /* encrypt/decrypt return < 0 on error or >= 0 on success. The return + * value from decrypt_mpdu is passed as the keyidx value for + * decrypt_msdu. skb must have enough head and tail room for the + * encryption; if not, error will be returned; these functions are + * called for all MPDUs (i.e., fragments). + */ + int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); + int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); + + /* These functions are called for full MSDUs, i.e. full frames. + * These can be NULL if full MSDU operations are not needed. */ + int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); + int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, + void *priv); + + int (*set_key) (void *key, int len, u8 * seq, void *priv); + int (*get_key) (void *key, int len, u8 * seq, void *priv); + + /* procfs handler for printing out key information and possible + * statistics */ + void (*print_stats) (struct seq_file *m, void *priv); + + /* Crypto specific flag get/set for configuration settings */ + unsigned long (*get_flags) (void *priv); + unsigned long (*set_flags) (unsigned long flags, void *priv); + + /* maximum number of bytes added by encryption; encrypt buf is + * allocated with extra_prefix_len bytes, copy of in_buf, and + * extra_postfix_len; encrypt need not use all this space, but + * the result must start at the beginning of the buffer and correct + * length must be returned */ + int extra_mpdu_prefix_len, extra_mpdu_postfix_len; + int extra_msdu_prefix_len, extra_msdu_postfix_len; + + struct module *owner; +}; + +struct libipw_crypt_info { + char *name; + /* Most clients will already have a lock, + so just point to that. */ + spinlock_t *lock; + + struct libipw_crypt_data *crypt[NUM_WEP_KEYS]; + int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ + struct list_head crypt_deinit_list; + struct timer_list crypt_deinit_timer; + int crypt_quiesced; +}; + struct libipw_device { struct net_device *dev; struct wireless_dev wdev; @@ -751,7 +819,7 @@ struct libipw_device { size_t wpa_ie_len; u8 *wpa_ie; - struct lib80211_crypt_info crypt_info; + struct libipw_crypt_info crypt_info; int bcrx_sta_key; /* use individual keys to override default keys even * with RX of broad/multicast frames */ @@ -988,4 +1056,31 @@ static inline int libipw_get_scans(struct libipw_device *ieee) return ieee->scans; } +struct libipw_crypt_data { + struct list_head list; /* delayed deletion list */ + const struct libipw_crypto_ops *ops; + void *priv; + atomic_t refcnt; +}; + +int libipw_crypt_info_init(struct libipw_crypt_info *info, char *name, + spinlock_t *lock); +void libipw_crypt_info_free(struct libipw_crypt_info *info); +int libipw_register_crypto_ops(const struct libipw_crypto_ops *ops); +int libipw_unregister_crypto_ops(const struct libipw_crypto_ops *ops); +const struct libipw_crypto_ops *libipw_get_crypto_ops(const char *name); +void libipw_crypt_delayed_deinit(struct libipw_crypt_info *info, + struct libipw_crypt_data **crypt); + +/* must be called in the listed order */ +int libipw_crypto_init(void); +int libipw_crypto_ccmp_init(void); +int libipw_crypto_tkip_init(void); +int libipw_crypto_wep_init(void); + +void libipw_crypto_wep_exit(void); +void libipw_crypto_tkip_exit(void); +void libipw_crypto_ccmp_exit(void); +void libipw_crypto_exit(void); + #endif /* LIBIPW_H */ diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto.c new file mode 100644 index 000000000000..32639e0e8430 --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw -- common bits for IPW drivers + * + * Copyright(c) 2008 John W. Linville + * + * Portions copied from old ieee80211 component, w/ original copyright + * notices below: + * + * Host AP crypto routines + * + * Copyright (c) 2002-2003, Jouni Malinen + * Portions Copyright (C) 2004, Intel Corporation + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +struct libipw_crypto_alg { + struct list_head list; + const struct libipw_crypto_ops *ops; +}; + +static LIST_HEAD(libipw_crypto_algs); +static DEFINE_SPINLOCK(libipw_crypto_lock); + +static void libipw_crypt_deinit_entries(struct libipw_crypt_info *info, + int force); +static void libipw_crypt_quiescing(struct libipw_crypt_info *info); +static void libipw_crypt_deinit_handler(struct timer_list *t); + +int libipw_crypt_info_init(struct libipw_crypt_info *info, char *name, + spinlock_t *lock) +{ + memset(info, 0, sizeof(*info)); + + info->name = name; + info->lock = lock; + + INIT_LIST_HEAD(&info->crypt_deinit_list); + timer_setup(&info->crypt_deinit_timer, libipw_crypt_deinit_handler, + 0); + + return 0; +} +EXPORT_SYMBOL(libipw_crypt_info_init); + +void libipw_crypt_info_free(struct libipw_crypt_info *info) +{ + int i; + + libipw_crypt_quiescing(info); + del_timer_sync(&info->crypt_deinit_timer); + libipw_crypt_deinit_entries(info, 1); + + for (i = 0; i < NUM_WEP_KEYS; i++) { + struct libipw_crypt_data *crypt = info->crypt[i]; + if (crypt) { + if (crypt->ops) { + crypt->ops->deinit(crypt->priv); + module_put(crypt->ops->owner); + } + kfree(crypt); + info->crypt[i] = NULL; + } + } +} +EXPORT_SYMBOL(libipw_crypt_info_free); + +static void libipw_crypt_deinit_entries(struct libipw_crypt_info *info, + int force) +{ + struct libipw_crypt_data *entry, *next; + unsigned long flags; + + spin_lock_irqsave(info->lock, flags); + list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) { + if (atomic_read(&entry->refcnt) != 0 && !force) + continue; + + list_del(&entry->list); + + if (entry->ops) { + entry->ops->deinit(entry->priv); + module_put(entry->ops->owner); + } + kfree(entry); + } + spin_unlock_irqrestore(info->lock, flags); +} + +/* After this, crypt_deinit_list won't accept new members */ +static void libipw_crypt_quiescing(struct libipw_crypt_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(info->lock, flags); + info->crypt_quiesced = 1; + spin_unlock_irqrestore(info->lock, flags); +} + +static void libipw_crypt_deinit_handler(struct timer_list *t) +{ + struct libipw_crypt_info *info = from_timer(info, t, + crypt_deinit_timer); + unsigned long flags; + + libipw_crypt_deinit_entries(info, 0); + + spin_lock_irqsave(info->lock, flags); + if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) { + printk(KERN_DEBUG "%s: entries remaining in delayed crypt " + "deletion list\n", info->name); + info->crypt_deinit_timer.expires = jiffies + HZ; + add_timer(&info->crypt_deinit_timer); + } + spin_unlock_irqrestore(info->lock, flags); +} + +void libipw_crypt_delayed_deinit(struct libipw_crypt_info *info, + struct libipw_crypt_data **crypt) +{ + struct libipw_crypt_data *tmp; + unsigned long flags; + + if (*crypt == NULL) + return; + + tmp = *crypt; + *crypt = NULL; + + /* must not run ops->deinit() while there may be pending encrypt or + * decrypt operations. Use a list of delayed deinits to avoid needing + * locking. */ + + spin_lock_irqsave(info->lock, flags); + if (!info->crypt_quiesced) { + list_add(&tmp->list, &info->crypt_deinit_list); + if (!timer_pending(&info->crypt_deinit_timer)) { + info->crypt_deinit_timer.expires = jiffies + HZ; + add_timer(&info->crypt_deinit_timer); + } + } + spin_unlock_irqrestore(info->lock, flags); +} +EXPORT_SYMBOL(libipw_crypt_delayed_deinit); + +int libipw_register_crypto_ops(const struct libipw_crypto_ops *ops) +{ + unsigned long flags; + struct libipw_crypto_alg *alg; + + alg = kzalloc(sizeof(*alg), GFP_KERNEL); + if (alg == NULL) + return -ENOMEM; + + alg->ops = ops; + + spin_lock_irqsave(&libipw_crypto_lock, flags); + list_add(&alg->list, &libipw_crypto_algs); + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + + printk(KERN_DEBUG "libipw_crypt: registered algorithm '%s'\n", + ops->name); + + return 0; +} +EXPORT_SYMBOL(libipw_register_crypto_ops); + +int libipw_unregister_crypto_ops(const struct libipw_crypto_ops *ops) +{ + struct libipw_crypto_alg *alg; + unsigned long flags; + + spin_lock_irqsave(&libipw_crypto_lock, flags); + list_for_each_entry(alg, &libipw_crypto_algs, list) { + if (alg->ops == ops) + goto found; + } + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + return -EINVAL; + + found: + printk(KERN_DEBUG "libipw_crypt: unregistered algorithm '%s'\n", + ops->name); + list_del(&alg->list); + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + kfree(alg); + return 0; +} +EXPORT_SYMBOL(libipw_unregister_crypto_ops); + +const struct libipw_crypto_ops *libipw_get_crypto_ops(const char *name) +{ + struct libipw_crypto_alg *alg; + unsigned long flags; + + spin_lock_irqsave(&libipw_crypto_lock, flags); + list_for_each_entry(alg, &libipw_crypto_algs, list) { + if (strcmp(alg->ops->name, name) == 0) + goto found; + } + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + return NULL; + + found: + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + return alg->ops; +} +EXPORT_SYMBOL(libipw_get_crypto_ops); + +static void *libipw_crypt_null_init(int keyidx) +{ + return (void *)1; +} + +static void libipw_crypt_null_deinit(void *priv) +{ +} + +static const struct libipw_crypto_ops libipw_crypt_null = { + .name = "NULL", + .init = libipw_crypt_null_init, + .deinit = libipw_crypt_null_deinit, + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_null); +} + +void libipw_crypto_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_null); + BUG_ON(!list_empty(&libipw_crypto_algs)); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c new file mode 100644 index 000000000000..bf900d8c8ad3 --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw crypt: host-based CCMP encryption implementation for libipw + * + * Copyright (c) 2003-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +#define AES_BLOCK_LEN 16 +#define CCMP_HDR_LEN 8 +#define CCMP_MIC_LEN 8 +#define CCMP_TK_LEN 16 +#define CCMP_PN_LEN 6 + +struct libipw_ccmp_data { + u8 key[CCMP_TK_LEN]; + int key_set; + + u8 tx_pn[CCMP_PN_LEN]; + u8 rx_pn[CCMP_PN_LEN]; + + u32 dot11RSNAStatsCCMPFormatErrors; + u32 dot11RSNAStatsCCMPReplays; + u32 dot11RSNAStatsCCMPDecryptErrors; + + int key_idx; + + struct crypto_aead *tfm; + + /* scratch buffers for virt_to_page() (crypto API) */ + u8 tx_aad[2 * AES_BLOCK_LEN]; + u8 rx_aad[2 * AES_BLOCK_LEN]; +}; + +static void *libipw_ccmp_init(int key_idx) +{ + struct libipw_ccmp_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + priv->key_idx = key_idx; + + priv->tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->tfm)) { + priv->tfm = NULL; + goto fail; + } + + return priv; + + fail: + if (priv) { + if (priv->tfm) + crypto_free_aead(priv->tfm); + kfree(priv); + } + + return NULL; +} + +static void libipw_ccmp_deinit(void *priv) +{ + struct libipw_ccmp_data *_priv = priv; + if (_priv && _priv->tfm) + crypto_free_aead(_priv->tfm); + kfree(priv); +} + +static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, + const u8 *pn, u8 *iv, u8 *aad) +{ + u8 *pos, qc = 0; + size_t aad_len; + int a4_included, qc_included; + + a4_included = ieee80211_has_a4(hdr->frame_control); + qc_included = ieee80211_is_data_qos(hdr->frame_control); + + aad_len = 22; + if (a4_included) + aad_len += 6; + if (qc_included) { + pos = (u8 *) & hdr->addr4; + if (a4_included) + pos += 6; + qc = *pos & 0x0f; + aad_len += 2; + } + + /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC + * mode authentication are not allowed to collide, yet both are derived + * from the same vector. We only set L := 1 here to indicate that the + * data size can be represented in (L+1) bytes. The CCM layer will take + * care of storing the data length in the top (L+1) bytes and setting + * and clearing the other bits as is required to derive the two IVs. + */ + iv[0] = 0x1; + + /* Nonce: QC | A2 | PN */ + iv[1] = qc; + memcpy(iv + 2, hdr->addr2, ETH_ALEN); + memcpy(iv + 8, pn, CCMP_PN_LEN); + + /* AAD: + * FC with bits 4..6 and 11..13 masked to zero; 14 is always one + * A1 | A2 | A3 + * SC with bits 4..15 (seq#) masked to zero + * A4 (if present) + * QC (if present) + */ + pos = (u8 *) hdr; + aad[0] = pos[0] & 0x8f; + aad[1] = pos[1] & 0xc7; + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); + pos = (u8 *) & hdr->seq_ctrl; + aad[20] = pos[0] & 0x0f; + aad[21] = 0; /* all bits masked */ + memset(aad + 22, 0, 8); + if (a4_included) + memcpy(aad + 22, hdr->addr4, ETH_ALEN); + if (qc_included) { + aad[a4_included ? 28 : 22] = qc; + /* rest of QC masked */ + } + return aad_len; +} + +static int libipw_ccmp_hdr(struct sk_buff *skb, int hdr_len, + u8 *aeskey, int keylen, void *priv) +{ + struct libipw_ccmp_data *key = priv; + int i; + u8 *pos; + + if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len) + return -1; + + if (aeskey != NULL && keylen >= CCMP_TK_LEN) + memcpy(aeskey, key->key, CCMP_TK_LEN); + + pos = skb_push(skb, CCMP_HDR_LEN); + memmove(pos, pos + CCMP_HDR_LEN, hdr_len); + pos += hdr_len; + + i = CCMP_PN_LEN - 1; + while (i >= 0) { + key->tx_pn[i]++; + if (key->tx_pn[i] != 0) + break; + i--; + } + + *pos++ = key->tx_pn[5]; + *pos++ = key->tx_pn[4]; + *pos++ = 0; + *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ; + *pos++ = key->tx_pn[3]; + *pos++ = key->tx_pn[2]; + *pos++ = key->tx_pn[1]; + *pos++ = key->tx_pn[0]; + + return CCMP_HDR_LEN; +} + +static int libipw_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_ccmp_data *key = priv; + struct ieee80211_hdr *hdr; + struct aead_request *req; + struct scatterlist sg[2]; + u8 *aad = key->tx_aad; + u8 iv[AES_BLOCK_LEN]; + int len, data_len, aad_len; + int ret; + + if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) + return -1; + + data_len = skb->len - hdr_len; + len = libipw_ccmp_hdr(skb, hdr_len, NULL, 0, priv); + if (len < 0) + return -1; + + req = aead_request_alloc(key->tfm, GFP_ATOMIC); + if (!req) + return -ENOMEM; + + hdr = (struct ieee80211_hdr *)skb->data; + aad_len = ccmp_init_iv_and_aad(hdr, key->tx_pn, iv, aad); + + skb_put(skb, CCMP_MIC_LEN); + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], aad, aad_len); + sg_set_buf(&sg[1], skb->data + hdr_len + CCMP_HDR_LEN, + data_len + CCMP_MIC_LEN); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_ad(req, aad_len); + aead_request_set_crypt(req, sg, sg, data_len, iv); + + ret = crypto_aead_encrypt(req); + aead_request_free(req); + + return ret; +} + +/* + * deal with seq counter wrapping correctly. + * refer to timer_after() for jiffies wrapping handling + */ +static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o) +{ + u32 iv32_n, iv16_n; + u32 iv32_o, iv16_o; + + iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3]; + iv16_n = (pn_n[4] << 8) | pn_n[5]; + + iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3]; + iv16_o = (pn_o[4] << 8) | pn_o[5]; + + if ((s32)iv32_n - (s32)iv32_o < 0 || + (iv32_n == iv32_o && iv16_n <= iv16_o)) + return 1; + return 0; +} + +static int libipw_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_ccmp_data *key = priv; + u8 keyidx, *pos; + struct ieee80211_hdr *hdr; + struct aead_request *req; + struct scatterlist sg[2]; + u8 *aad = key->rx_aad; + u8 iv[AES_BLOCK_LEN]; + u8 pn[6]; + int aad_len, ret; + size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN; + + if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) { + key->dot11RSNAStatsCCMPFormatErrors++; + return -1; + } + + hdr = (struct ieee80211_hdr *)skb->data; + pos = skb->data + hdr_len; + keyidx = pos[3]; + if (!(keyidx & (1 << 5))) { + net_dbg_ratelimited("CCMP: received packet without ExtIV flag from %pM\n", + hdr->addr2); + key->dot11RSNAStatsCCMPFormatErrors++; + return -2; + } + keyidx >>= 6; + if (key->key_idx != keyidx) { + net_dbg_ratelimited("CCMP: RX tkey->key_idx=%d frame keyidx=%d\n", + key->key_idx, keyidx); + return -6; + } + if (!key->key_set) { + net_dbg_ratelimited("CCMP: received packet from %pM with keyid=%d that does not have a configured key\n", + hdr->addr2, keyidx); + return -3; + } + + pn[0] = pos[7]; + pn[1] = pos[6]; + pn[2] = pos[5]; + pn[3] = pos[4]; + pn[4] = pos[1]; + pn[5] = pos[0]; + pos += 8; + + if (ccmp_replay_check(pn, key->rx_pn)) { +#ifdef CONFIG_LIBIPW_DEBUG + net_dbg_ratelimited("CCMP: replay detected: STA=%pM previous PN %02x%02x%02x%02x%02x%02x received PN %02x%02x%02x%02x%02x%02x\n", + hdr->addr2, + key->rx_pn[0], key->rx_pn[1], key->rx_pn[2], + key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], + pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); +#endif + key->dot11RSNAStatsCCMPReplays++; + return -4; + } + + req = aead_request_alloc(key->tfm, GFP_ATOMIC); + if (!req) + return -ENOMEM; + + aad_len = ccmp_init_iv_and_aad(hdr, pn, iv, aad); + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], aad, aad_len); + sg_set_buf(&sg[1], pos, data_len); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_ad(req, aad_len); + aead_request_set_crypt(req, sg, sg, data_len, iv); + + ret = crypto_aead_decrypt(req); + aead_request_free(req); + + if (ret) { + net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM (%d)\n", + hdr->addr2, ret); + key->dot11RSNAStatsCCMPDecryptErrors++; + return -5; + } + + memcpy(key->rx_pn, pn, CCMP_PN_LEN); + + /* Remove hdr and MIC */ + memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len); + skb_pull(skb, CCMP_HDR_LEN); + skb_trim(skb, skb->len - CCMP_MIC_LEN); + + return keyidx; +} + +static int libipw_ccmp_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_ccmp_data *data = priv; + int keyidx; + struct crypto_aead *tfm = data->tfm; + + keyidx = data->key_idx; + memset(data, 0, sizeof(*data)); + data->key_idx = keyidx; + data->tfm = tfm; + if (len == CCMP_TK_LEN) { + memcpy(data->key, key, CCMP_TK_LEN); + data->key_set = 1; + if (seq) { + data->rx_pn[0] = seq[5]; + data->rx_pn[1] = seq[4]; + data->rx_pn[2] = seq[3]; + data->rx_pn[3] = seq[2]; + data->rx_pn[4] = seq[1]; + data->rx_pn[5] = seq[0]; + } + if (crypto_aead_setauthsize(data->tfm, CCMP_MIC_LEN) || + crypto_aead_setkey(data->tfm, data->key, CCMP_TK_LEN)) + return -1; + } else if (len == 0) + data->key_set = 0; + else + return -1; + + return 0; +} + +static int libipw_ccmp_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_ccmp_data *data = priv; + + if (len < CCMP_TK_LEN) + return -1; + + if (!data->key_set) + return 0; + memcpy(key, data->key, CCMP_TK_LEN); + + if (seq) { + seq[0] = data->tx_pn[5]; + seq[1] = data->tx_pn[4]; + seq[2] = data->tx_pn[3]; + seq[3] = data->tx_pn[2]; + seq[4] = data->tx_pn[1]; + seq[5] = data->tx_pn[0]; + } + + return CCMP_TK_LEN; +} + +static void libipw_ccmp_print_stats(struct seq_file *m, void *priv) +{ + struct libipw_ccmp_data *ccmp = priv; + + seq_printf(m, + "key[%d] alg=CCMP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "format_errors=%d replays=%d decrypt_errors=%d\n", + ccmp->key_idx, ccmp->key_set, + ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], + ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], + ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], + ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], + ccmp->dot11RSNAStatsCCMPFormatErrors, + ccmp->dot11RSNAStatsCCMPReplays, + ccmp->dot11RSNAStatsCCMPDecryptErrors); +} + +static const struct libipw_crypto_ops libipw_crypt_ccmp = { + .name = "CCMP", + .init = libipw_ccmp_init, + .deinit = libipw_ccmp_deinit, + .encrypt_mpdu = libipw_ccmp_encrypt, + .decrypt_mpdu = libipw_ccmp_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = libipw_ccmp_set_key, + .get_key = libipw_ccmp_get_key, + .print_stats = libipw_ccmp_print_stats, + .extra_mpdu_prefix_len = CCMP_HDR_LEN, + .extra_mpdu_postfix_len = CCMP_MIC_LEN, + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_ccmp_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_ccmp); +} + +void libipw_crypto_ccmp_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_ccmp); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c new file mode 100644 index 000000000000..32288697da4f --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c @@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw crypt: host-based TKIP encryption implementation for libipw + * + * Copyright (c) 2003-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +#define TKIP_HDR_LEN 8 + +struct libipw_tkip_data { +#define TKIP_KEY_LEN 32 + u8 key[TKIP_KEY_LEN]; + int key_set; + + u32 tx_iv32; + u16 tx_iv16; + u16 tx_ttak[5]; + int tx_phase1_done; + + u32 rx_iv32; + u16 rx_iv16; + u16 rx_ttak[5]; + int rx_phase1_done; + u32 rx_iv32_new; + u16 rx_iv16_new; + + u32 dot11RSNAStatsTKIPReplays; + u32 dot11RSNAStatsTKIPICVErrors; + u32 dot11RSNAStatsTKIPLocalMICFailures; + + int key_idx; + + struct arc4_ctx rx_ctx_arc4; + struct arc4_ctx tx_ctx_arc4; + struct crypto_shash *rx_tfm_michael; + struct crypto_shash *tx_tfm_michael; + + /* scratch buffers for virt_to_page() (crypto API) */ + u8 rx_hdr[16], tx_hdr[16]; + + unsigned long flags; +}; + +static unsigned long libipw_tkip_set_flags(unsigned long flags, void *priv) +{ + struct libipw_tkip_data *_priv = priv; + unsigned long old_flags = _priv->flags; + _priv->flags = flags; + return old_flags; +} + +static unsigned long libipw_tkip_get_flags(void *priv) +{ + struct libipw_tkip_data *_priv = priv; + return _priv->flags; +} + +static void *libipw_tkip_init(int key_idx) +{ + struct libipw_tkip_data *priv; + + if (fips_enabled) + return NULL; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + + priv->key_idx = key_idx; + + priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); + if (IS_ERR(priv->tx_tfm_michael)) { + priv->tx_tfm_michael = NULL; + goto fail; + } + + priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); + if (IS_ERR(priv->rx_tfm_michael)) { + priv->rx_tfm_michael = NULL; + goto fail; + } + + return priv; + + fail: + if (priv) { + crypto_free_shash(priv->tx_tfm_michael); + crypto_free_shash(priv->rx_tfm_michael); + kfree(priv); + } + + return NULL; +} + +static void libipw_tkip_deinit(void *priv) +{ + struct libipw_tkip_data *_priv = priv; + if (_priv) { + crypto_free_shash(_priv->tx_tfm_michael); + crypto_free_shash(_priv->rx_tfm_michael); + } + kfree_sensitive(priv); +} + +static inline u16 RotR1(u16 val) +{ + return (val >> 1) | (val << 15); +} + +static inline u8 Lo8(u16 val) +{ + return val & 0xff; +} + +static inline u8 Hi8(u16 val) +{ + return val >> 8; +} + +static inline u16 Lo16(u32 val) +{ + return val & 0xffff; +} + +static inline u16 Hi16(u32 val) +{ + return val >> 16; +} + +static inline u16 Mk16(u8 hi, u8 lo) +{ + return lo | (((u16) hi) << 8); +} + +static inline u16 Mk16_le(__le16 * v) +{ + return le16_to_cpu(*v); +} + +static const u16 Sbox[256] = { + 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154, + 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A, + 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B, + 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B, + 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F, + 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F, + 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5, + 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F, + 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB, + 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397, + 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED, + 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A, + 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194, + 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3, + 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104, + 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D, + 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39, + 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695, + 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83, + 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76, + 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4, + 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B, + 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0, + 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018, + 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751, + 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85, + 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12, + 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9, + 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7, + 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A, + 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8, + 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A, +}; + +static inline u16 _S_(u16 v) +{ + u16 t = Sbox[Hi8(v)]; + return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8)); +} + +#define PHASE1_LOOP_COUNT 8 + +static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA, + u32 IV32) +{ + int i, j; + + /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */ + TTAK[0] = Lo16(IV32); + TTAK[1] = Hi16(IV32); + TTAK[2] = Mk16(TA[1], TA[0]); + TTAK[3] = Mk16(TA[3], TA[2]); + TTAK[4] = Mk16(TA[5], TA[4]); + + for (i = 0; i < PHASE1_LOOP_COUNT; i++) { + j = 2 * (i & 1); + TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j])); + TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j])); + TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j])); + TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j])); + TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i; + } +} + +static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, + u16 IV16) +{ + /* Make temporary area overlap WEP seed so that the final copy can be + * avoided on little endian hosts. */ + u16 *PPK = (u16 *) & WEPSeed[4]; + + /* Step 1 - make copy of TTAK and bring in TSC */ + PPK[0] = TTAK[0]; + PPK[1] = TTAK[1]; + PPK[2] = TTAK[2]; + PPK[3] = TTAK[3]; + PPK[4] = TTAK[4]; + PPK[5] = TTAK[4] + IV16; + + /* Step 2 - 96-bit bijective mixing using S-box */ + PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0])); + PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2])); + PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4])); + PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6])); + PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8])); + PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10])); + + PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12])); + PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14])); + PPK[2] += RotR1(PPK[1]); + PPK[3] += RotR1(PPK[2]); + PPK[4] += RotR1(PPK[3]); + PPK[5] += RotR1(PPK[4]); + + /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value + * WEPSeed[0..2] is transmitted as WEP IV */ + WEPSeed[0] = Hi8(IV16); + WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F; + WEPSeed[2] = Lo8(IV16); + WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1); + +#ifdef __BIG_ENDIAN + { + int i; + for (i = 0; i < 6; i++) + PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8); + } +#endif +} + +static int libipw_tkip_hdr(struct sk_buff *skb, int hdr_len, + u8 * rc4key, int keylen, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 *pos; + struct ieee80211_hdr *hdr; + + hdr = (struct ieee80211_hdr *)skb->data; + + if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len) + return -1; + + if (rc4key == NULL || keylen < 16) + return -1; + + if (!tkey->tx_phase1_done) { + tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2, + tkey->tx_iv32); + tkey->tx_phase1_done = 1; + } + tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16); + + pos = skb_push(skb, TKIP_HDR_LEN); + memmove(pos, pos + TKIP_HDR_LEN, hdr_len); + pos += hdr_len; + + *pos++ = *rc4key; + *pos++ = *(rc4key + 1); + *pos++ = *(rc4key + 2); + *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ; + *pos++ = tkey->tx_iv32 & 0xff; + *pos++ = (tkey->tx_iv32 >> 8) & 0xff; + *pos++ = (tkey->tx_iv32 >> 16) & 0xff; + *pos++ = (tkey->tx_iv32 >> 24) & 0xff; + + tkey->tx_iv16++; + if (tkey->tx_iv16 == 0) { + tkey->tx_phase1_done = 0; + tkey->tx_iv32++; + } + + return TKIP_HDR_LEN; +} + +static int libipw_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + int len; + u8 rc4key[16], *pos, *icv; + u32 crc; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + net_dbg_ratelimited("TKIP countermeasures: dropped TX packet to %pM\n", + hdr->addr1); + return -1; + } + + if (skb_tailroom(skb) < 4 || skb->len < hdr_len) + return -1; + + len = skb->len - hdr_len; + pos = skb->data + hdr_len; + + if ((libipw_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0) + return -1; + + crc = ~crc32_le(~0, pos, len); + icv = skb_put(skb, 4); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + + arc4_setkey(&tkey->tx_ctx_arc4, rc4key, 16); + arc4_crypt(&tkey->tx_ctx_arc4, pos, pos, len + 4); + + return 0; +} + +/* + * deal with seq counter wrapping correctly. + * refer to timer_after() for jiffies wrapping handling + */ +static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, + u32 iv32_o, u16 iv16_o) +{ + if ((s32)iv32_n - (s32)iv32_o < 0 || + (iv32_n == iv32_o && iv16_n <= iv16_o)) + return 1; + return 0; +} + +static int libipw_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 rc4key[16]; + u8 keyidx, *pos; + u32 iv32; + u16 iv16; + struct ieee80211_hdr *hdr; + u8 icv[4]; + u32 crc; + int plen; + + hdr = (struct ieee80211_hdr *)skb->data; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + net_dbg_ratelimited("TKIP countermeasures: dropped received packet from %pM\n", + hdr->addr2); + return -1; + } + + if (skb->len < hdr_len + TKIP_HDR_LEN + 4) + return -1; + + pos = skb->data + hdr_len; + keyidx = pos[3]; + if (!(keyidx & (1 << 5))) { + net_dbg_ratelimited("TKIP: received packet without ExtIV flag from %pM\n", + hdr->addr2); + return -2; + } + keyidx >>= 6; + if (tkey->key_idx != keyidx) { + net_dbg_ratelimited("TKIP: RX tkey->key_idx=%d frame keyidx=%d\n", + tkey->key_idx, keyidx); + return -6; + } + if (!tkey->key_set) { + net_dbg_ratelimited("TKIP: received packet from %pM with keyid=%d that does not have a configured key\n", + hdr->addr2, keyidx); + return -3; + } + iv16 = (pos[0] << 8) | pos[2]; + iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24); + pos += TKIP_HDR_LEN; + + if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { +#ifdef CONFIG_LIBIPW_DEBUG + net_dbg_ratelimited("TKIP: replay detected: STA=%pM previous TSC %08x%04x received TSC %08x%04x\n", + hdr->addr2, tkey->rx_iv32, tkey->rx_iv16, + iv32, iv16); +#endif + tkey->dot11RSNAStatsTKIPReplays++; + return -4; + } + + if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) { + tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32); + tkey->rx_phase1_done = 1; + } + tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16); + + plen = skb->len - hdr_len - 12; + + arc4_setkey(&tkey->rx_ctx_arc4, rc4key, 16); + arc4_crypt(&tkey->rx_ctx_arc4, pos, pos, plen + 4); + + crc = ~crc32_le(~0, pos, plen); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + if (memcmp(icv, pos + plen, 4) != 0) { + if (iv32 != tkey->rx_iv32) { + /* Previously cached Phase1 result was already lost, so + * it needs to be recalculated for the next packet. */ + tkey->rx_phase1_done = 0; + } +#ifdef CONFIG_LIBIPW_DEBUG + net_dbg_ratelimited("TKIP: ICV error detected: STA=%pM\n", + hdr->addr2); +#endif + tkey->dot11RSNAStatsTKIPICVErrors++; + return -5; + } + + /* Update real counters only after Michael MIC verification has + * completed */ + tkey->rx_iv32_new = iv32; + tkey->rx_iv16_new = iv16; + + /* Remove IV and ICV */ + memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len); + skb_pull(skb, TKIP_HDR_LEN); + skb_trim(skb, skb->len - 4); + + return keyidx; +} + +static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, + u8 *data, size_t data_len, u8 *mic) +{ + SHASH_DESC_ON_STACK(desc, tfm_michael); + int err; + + if (tfm_michael == NULL) { + pr_warn("%s(): tfm_michael == NULL\n", __func__); + return -1; + } + + desc->tfm = tfm_michael; + + if (crypto_shash_setkey(tfm_michael, key, 8)) + return -1; + + err = crypto_shash_init(desc); + if (err) + goto out; + err = crypto_shash_update(desc, hdr, 16); + if (err) + goto out; + err = crypto_shash_update(desc, data, data_len); + if (err) + goto out; + err = crypto_shash_final(desc, mic); + +out: + shash_desc_zero(desc); + return err; +} + +static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) +{ + struct ieee80211_hdr *hdr11; + + hdr11 = (struct ieee80211_hdr *)skb->data; + + switch (le16_to_cpu(hdr11->frame_control) & + (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) { + case IEEE80211_FCTL_TODS: + memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ + break; + case IEEE80211_FCTL_FROMDS: + memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */ + break; + case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS: + memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ + break; + default: + memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ + break; + } + + if (ieee80211_is_data_qos(hdr11->frame_control)) { + hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11))) + & IEEE80211_QOS_CTL_TID_MASK; + } else + hdr[12] = 0; /* priority */ + + hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */ +} + +static int libipw_michael_mic_add(struct sk_buff *skb, int hdr_len, + void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 *pos; + + if (skb_tailroom(skb) < 8 || skb->len < hdr_len) { + printk(KERN_DEBUG "Invalid packet for Michael MIC add " + "(tailroom=%d hdr_len=%d skb->len=%d)\n", + skb_tailroom(skb), hdr_len, skb->len); + return -1; + } + + michael_mic_hdr(skb, tkey->tx_hdr); + pos = skb_put(skb, 8); + if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr, + skb->data + hdr_len, skb->len - 8 - hdr_len, pos)) + return -1; + + return 0; +} + +static void libipw_michael_mic_failure(struct net_device *dev, + struct ieee80211_hdr *hdr, + int keyidx) +{ + union iwreq_data wrqu; + struct iw_michaelmicfailure ev; + + /* TODO: needed parameters: count, keyid, key type, TSC */ + memset(&ev, 0, sizeof(ev)); + ev.flags = keyidx & IW_MICFAILURE_KEY_ID; + if (hdr->addr1[0] & 0x01) + ev.flags |= IW_MICFAILURE_GROUP; + else + ev.flags |= IW_MICFAILURE_PAIRWISE; + ev.src_addr.sa_family = ARPHRD_ETHER; + memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN); + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = sizeof(ev); + wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev); +} + +static int libipw_michael_mic_verify(struct sk_buff *skb, int keyidx, + int hdr_len, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 mic[8]; + + if (!tkey->key_set) + return -1; + + michael_mic_hdr(skb, tkey->rx_hdr); + if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr, + skb->data + hdr_len, skb->len - 8 - hdr_len, mic)) + return -1; + if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) { + struct ieee80211_hdr *hdr; + hdr = (struct ieee80211_hdr *)skb->data; + printk(KERN_DEBUG "%s: Michael MIC verification failed for " + "MSDU from %pM keyidx=%d\n", + skb->dev ? skb->dev->name : "N/A", hdr->addr2, + keyidx); + if (skb->dev) + libipw_michael_mic_failure(skb->dev, hdr, keyidx); + tkey->dot11RSNAStatsTKIPLocalMICFailures++; + return -1; + } + + /* Update TSC counters for RX now that the packet verification has + * completed. */ + tkey->rx_iv32 = tkey->rx_iv32_new; + tkey->rx_iv16 = tkey->rx_iv16_new; + + skb_trim(skb, skb->len - 8); + + return 0; +} + +static int libipw_tkip_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + int keyidx; + struct crypto_shash *tfm = tkey->tx_tfm_michael; + struct arc4_ctx *tfm2 = &tkey->tx_ctx_arc4; + struct crypto_shash *tfm3 = tkey->rx_tfm_michael; + struct arc4_ctx *tfm4 = &tkey->rx_ctx_arc4; + + keyidx = tkey->key_idx; + memset(tkey, 0, sizeof(*tkey)); + tkey->key_idx = keyidx; + tkey->tx_tfm_michael = tfm; + tkey->tx_ctx_arc4 = *tfm2; + tkey->rx_tfm_michael = tfm3; + tkey->rx_ctx_arc4 = *tfm4; + if (len == TKIP_KEY_LEN) { + memcpy(tkey->key, key, TKIP_KEY_LEN); + tkey->key_set = 1; + tkey->tx_iv16 = 1; /* TSC is initialized to 1 */ + if (seq) { + tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) | + (seq[3] << 8) | seq[2]; + tkey->rx_iv16 = (seq[1] << 8) | seq[0]; + } + } else if (len == 0) + tkey->key_set = 0; + else + return -1; + + return 0; +} + +static int libipw_tkip_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + + if (len < TKIP_KEY_LEN) + return -1; + + if (!tkey->key_set) + return 0; + memcpy(key, tkey->key, TKIP_KEY_LEN); + + if (seq) { + /* + * Not clear if this should return the value as is + * or - as the code previously seemed to partially + * have been written as - subtract one from it. It + * was working this way for a long time so leave it. + */ + seq[0] = tkey->tx_iv16; + seq[1] = tkey->tx_iv16 >> 8; + seq[2] = tkey->tx_iv32; + seq[3] = tkey->tx_iv32 >> 8; + seq[4] = tkey->tx_iv32 >> 16; + seq[5] = tkey->tx_iv32 >> 24; + } + + return TKIP_KEY_LEN; +} + +static void libipw_tkip_print_stats(struct seq_file *m, void *priv) +{ + struct libipw_tkip_data *tkip = priv; + seq_printf(m, + "key[%d] alg=TKIP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "replays=%d icv_errors=%d local_mic_failures=%d\n", + tkip->key_idx, tkip->key_set, + (tkip->tx_iv32 >> 24) & 0xff, + (tkip->tx_iv32 >> 16) & 0xff, + (tkip->tx_iv32 >> 8) & 0xff, + tkip->tx_iv32 & 0xff, + (tkip->tx_iv16 >> 8) & 0xff, + tkip->tx_iv16 & 0xff, + (tkip->rx_iv32 >> 24) & 0xff, + (tkip->rx_iv32 >> 16) & 0xff, + (tkip->rx_iv32 >> 8) & 0xff, + tkip->rx_iv32 & 0xff, + (tkip->rx_iv16 >> 8) & 0xff, + tkip->rx_iv16 & 0xff, + tkip->dot11RSNAStatsTKIPReplays, + tkip->dot11RSNAStatsTKIPICVErrors, + tkip->dot11RSNAStatsTKIPLocalMICFailures); +} + +static const struct libipw_crypto_ops libipw_crypt_tkip = { + .name = "TKIP", + .init = libipw_tkip_init, + .deinit = libipw_tkip_deinit, + .encrypt_mpdu = libipw_tkip_encrypt, + .decrypt_mpdu = libipw_tkip_decrypt, + .encrypt_msdu = libipw_michael_mic_add, + .decrypt_msdu = libipw_michael_mic_verify, + .set_key = libipw_tkip_set_key, + .get_key = libipw_tkip_get_key, + .print_stats = libipw_tkip_print_stats, + .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .extra_msdu_postfix_len = 8, /* MIC */ + .get_flags = libipw_tkip_get_flags, + .set_flags = libipw_tkip_set_flags, + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_tkip_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_tkip); +} + +void libipw_crypto_tkip_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_tkip); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c new file mode 100644 index 000000000000..c3a4ccb9de17 --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw crypt: host-based WEP encryption implementation for libipw + * + * Copyright (c) 2002-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +struct libipw_wep_data { + u32 iv; +#define WEP_KEY_LEN 13 + u8 key[WEP_KEY_LEN + 1]; + u8 key_len; + u8 key_idx; + struct arc4_ctx tx_ctx; + struct arc4_ctx rx_ctx; +}; + +static void *libipw_wep_init(int keyidx) +{ + struct libipw_wep_data *priv; + + if (fips_enabled) + return NULL; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + return NULL; + priv->key_idx = keyidx; + + /* start WEP IV from a random value */ + get_random_bytes(&priv->iv, 4); + + return priv; +} + +static void libipw_wep_deinit(void *priv) +{ + kfree_sensitive(priv); +} + +/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */ +static int libipw_wep_build_iv(struct sk_buff *skb, int hdr_len, + u8 *key, int keylen, void *priv) +{ + struct libipw_wep_data *wep = priv; + u32 klen; + u8 *pos; + + if (skb_headroom(skb) < 4 || skb->len < hdr_len) + return -1; + + pos = skb_push(skb, 4); + memmove(pos, pos + 4, hdr_len); + pos += hdr_len; + + klen = 3 + wep->key_len; + + wep->iv++; + + /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key + * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N) + * can be used to speedup attacks, so avoid using them. */ + if ((wep->iv & 0xff00) == 0xff00) { + u8 B = (wep->iv >> 16) & 0xff; + if (B >= 3 && B < klen) + wep->iv += 0x0100; + } + + /* Prepend 24-bit IV to RC4 key and TX frame */ + *pos++ = (wep->iv >> 16) & 0xff; + *pos++ = (wep->iv >> 8) & 0xff; + *pos++ = wep->iv & 0xff; + *pos++ = wep->key_idx << 6; + + return 0; +} + +/* Perform WEP encryption on given skb that has at least 4 bytes of headroom + * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, + * so the payload length increases with 8 bytes. + * + * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) + */ +static int libipw_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_wep_data *wep = priv; + u32 crc, klen, len; + u8 *pos, *icv; + u8 key[WEP_KEY_LEN + 3]; + + /* other checks are in libipw_wep_build_iv */ + if (skb_tailroom(skb) < 4) + return -1; + + /* add the IV to the frame */ + if (libipw_wep_build_iv(skb, hdr_len, NULL, 0, priv)) + return -1; + + /* Copy the IV into the first 3 bytes of the key */ + skb_copy_from_linear_data_offset(skb, hdr_len, key, 3); + + /* Copy rest of the WEP key (the secret part) */ + memcpy(key + 3, wep->key, wep->key_len); + + len = skb->len - hdr_len - 4; + pos = skb->data + hdr_len + 4; + klen = 3 + wep->key_len; + + /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */ + crc = ~crc32_le(~0, pos, len); + icv = skb_put(skb, 4); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + + arc4_setkey(&wep->tx_ctx, key, klen); + arc4_crypt(&wep->tx_ctx, pos, pos, len + 4); + + return 0; +} + +/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of + * the frame: IV (4 bytes), encrypted payload (including SNAP header), + * ICV (4 bytes). len includes both IV and ICV. + * + * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on + * failure. If frame is OK, IV and ICV will be removed. + */ +static int libipw_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_wep_data *wep = priv; + u32 crc, klen, plen; + u8 key[WEP_KEY_LEN + 3]; + u8 keyidx, *pos, icv[4]; + + if (skb->len < hdr_len + 8) + return -1; + + pos = skb->data + hdr_len; + key[0] = *pos++; + key[1] = *pos++; + key[2] = *pos++; + keyidx = *pos++ >> 6; + if (keyidx != wep->key_idx) + return -1; + + klen = 3 + wep->key_len; + + /* Copy rest of the WEP key (the secret part) */ + memcpy(key + 3, wep->key, wep->key_len); + + /* Apply RC4 to data and compute CRC32 over decrypted data */ + plen = skb->len - hdr_len - 8; + + arc4_setkey(&wep->rx_ctx, key, klen); + arc4_crypt(&wep->rx_ctx, pos, pos, plen + 4); + + crc = ~crc32_le(~0, pos, plen); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + if (memcmp(icv, pos + plen, 4) != 0) { + /* ICV mismatch - drop frame */ + return -2; + } + + /* Remove IV and ICV */ + memmove(skb->data + 4, skb->data, hdr_len); + skb_pull(skb, 4); + skb_trim(skb, skb->len - 4); + + return 0; +} + +static int libipw_wep_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_wep_data *wep = priv; + + if (len < 0 || len > WEP_KEY_LEN) + return -1; + + memcpy(wep->key, key, len); + wep->key_len = len; + + return 0; +} + +static int libipw_wep_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_wep_data *wep = priv; + + if (len < wep->key_len) + return -1; + + memcpy(key, wep->key, wep->key_len); + + return wep->key_len; +} + +static void libipw_wep_print_stats(struct seq_file *m, void *priv) +{ + struct libipw_wep_data *wep = priv; + seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); +} + +static const struct libipw_crypto_ops libipw_crypt_wep = { + .name = "WEP", + .init = libipw_wep_init, + .deinit = libipw_wep_deinit, + .encrypt_mpdu = libipw_wep_encrypt, + .decrypt_mpdu = libipw_wep_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = libipw_wep_set_key, + .get_key = libipw_wep_get_key, + .print_stats = libipw_wep_print_stats, + .extra_mpdu_prefix_len = 4, /* IV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_wep_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_wep); +} + +void __exit libipw_crypto_wep_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_wep); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_module.c b/drivers/net/wireless/intel/ipw2x00/libipw_module.c index 43bab92a4148..0a16127bfd68 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_module.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_module.c @@ -169,7 +169,7 @@ struct net_device *alloc_libipw(int sizeof_priv, int monitor) spin_lock_init(&ieee->lock); - lib80211_crypt_info_init(&ieee->crypt_info, dev->name, &ieee->lock); + libipw_crypt_info_init(&ieee->crypt_info, dev->name, &ieee->lock); ieee->wpa_enabled = 0; ieee->drop_unencrypted = 0; @@ -191,7 +191,7 @@ void free_libipw(struct net_device *dev, int monitor) { struct libipw_device *ieee = netdev_priv(dev); - lib80211_crypt_info_free(&ieee->crypt_info); + libipw_crypt_info_free(&ieee->crypt_info); libipw_networks_free(ieee); @@ -251,6 +251,7 @@ static const struct proc_ops debug_level_proc_ops = { static int __init libipw_init(void) { + int err; #ifdef CONFIG_LIBIPW_DEBUG struct proc_dir_entry *e; @@ -273,7 +274,33 @@ static int __init libipw_init(void) printk(KERN_INFO DRV_NAME ": " DRV_DESCRIPTION ", " DRV_VERSION "\n"); printk(KERN_INFO DRV_NAME ": " DRV_COPYRIGHT "\n"); + err = libipw_crypto_init(); + if (err) + goto remove_debugfs; + err = libipw_crypto_ccmp_init(); + if (err) + goto uninit_crypto; + err = libipw_crypto_tkip_init(); + if (err) + goto uninit_crypto_ccmp; + err = libipw_crypto_wep_init(); + if (err) + goto uninit_crypto_tkip; + return 0; +uninit_crypto_tkip: + libipw_crypto_tkip_exit(); +uninit_crypto_ccmp: + libipw_crypto_ccmp_exit(); +uninit_crypto: + libipw_crypto_exit(); +remove_debugfs: +#ifdef CONFIG_LIBIPW_DEBUG + remove_proc_entry("debug_level", libipw_proc); + remove_proc_entry(DRV_PROCNAME, init_net.proc_net); + libipw_proc = NULL; +#endif + return err; } static void __exit libipw_exit(void) @@ -285,6 +312,11 @@ static void __exit libipw_exit(void) libipw_proc = NULL; } #endif /* CONFIG_LIBIPW_DEBUG */ + + libipw_crypto_ccmp_exit(); + libipw_crypto_tkip_exit(); + libipw_crypto_wep_exit(); + libipw_crypto_exit(); } #ifdef CONFIG_LIBIPW_DEBUG diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c index 48d6870bbf4e..1fe05e73a17c 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c @@ -27,9 +27,6 @@ #include #include #include - -#include - #include "libipw.h" static void libipw_monitor_rx(struct libipw_device *ieee, @@ -266,7 +263,7 @@ static int libipw_is_eapol_frame(struct libipw_device *ieee, /* Called only as a tasklet (software IRQ), by libipw_rx */ static int libipw_rx_frame_decrypt(struct libipw_device *ieee, struct sk_buff *skb, - struct lib80211_crypt_data *crypt) + struct libipw_crypt_data *crypt) { struct libipw_hdr_3addr *hdr; int res, hdrlen; @@ -298,7 +295,7 @@ libipw_rx_frame_decrypt(struct libipw_device *ieee, struct sk_buff *skb, static int libipw_rx_frame_decrypt_msdu(struct libipw_device *ieee, struct sk_buff *skb, int keyidx, - struct lib80211_crypt_data *crypt) + struct libipw_crypt_data *crypt) { struct libipw_hdr_3addr *hdr; int res, hdrlen; @@ -345,7 +342,7 @@ int libipw_rx(struct libipw_device *ieee, struct sk_buff *skb, #endif u8 dst[ETH_ALEN]; u8 src[ETH_ALEN]; - struct lib80211_crypt_data *crypt = NULL; + struct libipw_crypt_data *crypt = NULL; int keyidx = 0; int can_be_decrypted = 0; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c index e22a6732a4c3..80edaa3dea9c 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c @@ -138,7 +138,7 @@ static int libipw_copy_snap(u8 * data, __be16 h_proto) static int libipw_encrypt_fragment(struct libipw_device *ieee, struct sk_buff *frag, int hdr_len) { - struct lib80211_crypt_data *crypt = + struct libipw_crypt_data *crypt = ieee->crypt_info.crypt[ieee->crypt_info.tx_keyidx]; int res; @@ -255,7 +255,7 @@ netdev_tx_t libipw_xmit(struct sk_buff *skb, struct net_device *dev) .qos_ctl = 0 }; u8 dest[ETH_ALEN], src[ETH_ALEN]; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; int priority = skb->priority; int snapped = 0; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_wx.c b/drivers/net/wireless/intel/ipw2x00/libipw_wx.c index dbc7153d0a3d..db71d81b0d4f 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_wx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_wx.c @@ -21,10 +21,7 @@ #include #include #include - -#include #include - #include "libipw.h" static const char *libipw_modes[] = { @@ -303,7 +300,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, .flags = 0 }; int i, key, key_provided, len; - struct lib80211_crypt_data **crypt; + struct libipw_crypt_data **crypt; int host_crypto = ieee->host_encrypt || ieee->host_decrypt; LIBIPW_DEBUG_WX("SET_ENCODE\n"); @@ -328,7 +325,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, if (key_provided && *crypt) { LIBIPW_DEBUG_WX("Disabling encryption on key %d.\n", key); - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); } else LIBIPW_DEBUG_WX("Disabling encryption.\n"); @@ -338,7 +335,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, if (ieee->crypt_info.crypt[i] != NULL) { if (key_provided) break; - lib80211_crypt_delayed_deinit(&ieee->crypt_info, + libipw_crypt_delayed_deinit(&ieee->crypt_info, &ieee->crypt_info.crypt[i]); } } @@ -361,21 +358,21 @@ int libipw_wx_set_encode(struct libipw_device *ieee, strcmp((*crypt)->ops->name, "WEP") != 0) { /* changing to use WEP; deinit previously used algorithm * on this key */ - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); } if (*crypt == NULL && host_crypto) { - struct lib80211_crypt_data *new_crypt; + struct libipw_crypt_data *new_crypt; /* take WEP into use */ - new_crypt = kzalloc(sizeof(struct lib80211_crypt_data), + new_crypt = kzalloc(sizeof(struct libipw_crypt_data), GFP_KERNEL); if (new_crypt == NULL) return -ENOMEM; - new_crypt->ops = lib80211_get_crypto_ops("WEP"); + new_crypt->ops = libipw_get_crypto_ops("WEP"); if (!new_crypt->ops) { - request_module("lib80211_crypt_wep"); - new_crypt->ops = lib80211_get_crypto_ops("WEP"); + request_module("libipw_crypt_wep"); + new_crypt->ops = libipw_get_crypto_ops("WEP"); } if (new_crypt->ops && try_module_get(new_crypt->ops->owner)) @@ -386,7 +383,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, new_crypt = NULL; printk(KERN_WARNING "%s: could not initialize WEP: " - "load module lib80211_crypt_wep\n", dev->name); + "load module libipw_crypt_wep\n", dev->name); return -EOPNOTSUPP; } *crypt = new_crypt; @@ -509,8 +506,8 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, int i, idx, ret = 0; int group_key = 0; const char *alg, *module; - const struct lib80211_crypto_ops *ops; - struct lib80211_crypt_data **crypt; + const struct libipw_crypto_ops *ops; + struct libipw_crypt_data **crypt; struct libipw_security sec = { .flags = 0, @@ -541,7 +538,7 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, if ((encoding->flags & IW_ENCODE_DISABLED) || ext->alg == IW_ENCODE_ALG_NONE) { if (*crypt) - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); for (i = 0; i < WEP_KEYS; i++) if (ieee->crypt_info.crypt[i] != NULL) @@ -567,15 +564,15 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, switch (ext->alg) { case IW_ENCODE_ALG_WEP: alg = "WEP"; - module = "lib80211_crypt_wep"; + module = "libipw_crypt_wep"; break; case IW_ENCODE_ALG_TKIP: alg = "TKIP"; - module = "lib80211_crypt_tkip"; + module = "libipw_crypt_tkip"; break; case IW_ENCODE_ALG_CCMP: alg = "CCMP"; - module = "lib80211_crypt_ccmp"; + module = "libipw_crypt_ccmp"; break; default: LIBIPW_DEBUG_WX("%s: unknown crypto alg %d\n", @@ -584,10 +581,10 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, goto done; } - ops = lib80211_get_crypto_ops(alg); + ops = libipw_get_crypto_ops(alg); if (ops == NULL) { request_module(module); - ops = lib80211_get_crypto_ops(alg); + ops = libipw_get_crypto_ops(alg); } if (ops == NULL) { LIBIPW_DEBUG_WX("%s: unknown crypto alg %d\n", @@ -597,9 +594,9 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, } if (*crypt == NULL || (*crypt)->ops != ops) { - struct lib80211_crypt_data *new_crypt; + struct libipw_crypt_data *new_crypt; - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); new_crypt = kzalloc(sizeof(*new_crypt), GFP_KERNEL); if (new_crypt == NULL) { diff --git a/include/net/lib80211.h b/include/net/lib80211.h deleted file mode 100644 index fd0f15d87d80..000000000000 --- a/include/net/lib80211.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * lib80211.h -- common bits for IEEE802.11 wireless drivers - * - * Copyright (c) 2008, John W. Linville - * - * Some bits copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Original code based on Host AP (software wireless LAN access point) driver - * for Intersil Prism2/2.5/3. - * - * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - * - * Copyright (c) 2002-2003, Jouni Malinen - * - * Adaption to a generic IEEE 802.11 stack by James Ketrenos - * - * - * Copyright (c) 2004, Intel Corporation - * - */ - -#ifndef LIB80211_H -#define LIB80211_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_WEP_KEYS 4 - -enum { - IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), -}; - -struct module; - -struct lib80211_crypto_ops { - const char *name; - struct list_head list; - - /* init new crypto context (e.g., allocate private data space, - * select IV, etc.); returns NULL on failure or pointer to allocated - * private data on success */ - void *(*init) (int keyidx); - - /* deinitialize crypto context and free allocated private data */ - void (*deinit) (void *priv); - - /* encrypt/decrypt return < 0 on error or >= 0 on success. The return - * value from decrypt_mpdu is passed as the keyidx value for - * decrypt_msdu. skb must have enough head and tail room for the - * encryption; if not, error will be returned; these functions are - * called for all MPDUs (i.e., fragments). - */ - int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - - /* These functions are called for full MSDUs, i.e. full frames. - * These can be NULL if full MSDU operations are not needed. */ - int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, - void *priv); - - int (*set_key) (void *key, int len, u8 * seq, void *priv); - int (*get_key) (void *key, int len, u8 * seq, void *priv); - - /* procfs handler for printing out key information and possible - * statistics */ - void (*print_stats) (struct seq_file *m, void *priv); - - /* Crypto specific flag get/set for configuration settings */ - unsigned long (*get_flags) (void *priv); - unsigned long (*set_flags) (unsigned long flags, void *priv); - - /* maximum number of bytes added by encryption; encrypt buf is - * allocated with extra_prefix_len bytes, copy of in_buf, and - * extra_postfix_len; encrypt need not use all this space, but - * the result must start at the beginning of the buffer and correct - * length must be returned */ - int extra_mpdu_prefix_len, extra_mpdu_postfix_len; - int extra_msdu_prefix_len, extra_msdu_postfix_len; - - struct module *owner; -}; - -struct lib80211_crypt_data { - struct list_head list; /* delayed deletion list */ - const struct lib80211_crypto_ops *ops; - void *priv; - atomic_t refcnt; -}; - -struct lib80211_crypt_info { - char *name; - /* Most clients will already have a lock, - so just point to that. */ - spinlock_t *lock; - - struct lib80211_crypt_data *crypt[NUM_WEP_KEYS]; - int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ - struct list_head crypt_deinit_list; - struct timer_list crypt_deinit_timer; - int crypt_quiesced; -}; - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock); -void lib80211_crypt_info_free(struct lib80211_crypt_info *info); -int lib80211_register_crypto_ops(const struct lib80211_crypto_ops *ops); -int lib80211_unregister_crypto_ops(const struct lib80211_crypto_ops *ops); -const struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name); -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt); - -#endif /* LIB80211_H */ diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 10345388ad13..733c53ad4de5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -212,36 +212,3 @@ config CFG80211_KUNIT_TEST If unsure, say N. endif # CFG80211 - -config LIB80211 - tristate - default n - help - This options enables a library of common routines used - by IEEE802.11 wireless LAN drivers. - - Drivers should select this themselves if needed. - -config LIB80211_CRYPT_WEP - tristate - select CRYPTO_LIB_ARC4 - -config LIB80211_CRYPT_CCMP - tristate - select CRYPTO - select CRYPTO_AES - select CRYPTO_CCM - -config LIB80211_CRYPT_TKIP - tristate - select CRYPTO_LIB_ARC4 - -config LIB80211_DEBUG - bool "lib80211 debugging messages" - depends on LIB80211 - default n - help - You can enable this if you want verbose debugging messages - from lib80211. - - If unsure, say N. diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1d49cc8b6da1..27f211bd9954 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CFG80211) += cfg80211.o -obj-$(CONFIG_LIB80211) += lib80211.o -obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o -obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o -obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o obj-y += tests/ obj-$(CONFIG_WEXT_CORE) += wext-core.o diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c deleted file mode 100644 index 64c447040786..000000000000 --- a/net/wireless/lib80211.c +++ /dev/null @@ -1,257 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 -- common bits for IEEE802.11 drivers - * - * Copyright(c) 2008 John W. Linville - * - * Portions copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Host AP crypto routines - * - * Copyright (c) 2002-2003, Jouni Malinen - * Portions Copyright (C) 2004, Intel Corporation - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define DRV_DESCRIPTION "common routines for IEEE802.11 drivers" - -MODULE_DESCRIPTION(DRV_DESCRIPTION); -MODULE_AUTHOR("John W. Linville "); -MODULE_LICENSE("GPL"); - -struct lib80211_crypto_alg { - struct list_head list; - const struct lib80211_crypto_ops *ops; -}; - -static LIST_HEAD(lib80211_crypto_algs); -static DEFINE_SPINLOCK(lib80211_crypto_lock); - -static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, - int force); -static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info); -static void lib80211_crypt_deinit_handler(struct timer_list *t); - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock) -{ - memset(info, 0, sizeof(*info)); - - info->name = name; - info->lock = lock; - - INIT_LIST_HEAD(&info->crypt_deinit_list); - timer_setup(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler, - 0); - - return 0; -} -EXPORT_SYMBOL(lib80211_crypt_info_init); - -void lib80211_crypt_info_free(struct lib80211_crypt_info *info) -{ - int i; - - lib80211_crypt_quiescing(info); - del_timer_sync(&info->crypt_deinit_timer); - lib80211_crypt_deinit_entries(info, 1); - - for (i = 0; i < NUM_WEP_KEYS; i++) { - struct lib80211_crypt_data *crypt = info->crypt[i]; - if (crypt) { - if (crypt->ops) { - crypt->ops->deinit(crypt->priv); - module_put(crypt->ops->owner); - } - kfree(crypt); - info->crypt[i] = NULL; - } - } -} -EXPORT_SYMBOL(lib80211_crypt_info_free); - -static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, - int force) -{ - struct lib80211_crypt_data *entry, *next; - unsigned long flags; - - spin_lock_irqsave(info->lock, flags); - list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) { - if (atomic_read(&entry->refcnt) != 0 && !force) - continue; - - list_del(&entry->list); - - if (entry->ops) { - entry->ops->deinit(entry->priv); - module_put(entry->ops->owner); - } - kfree(entry); - } - spin_unlock_irqrestore(info->lock, flags); -} - -/* After this, crypt_deinit_list won't accept new members */ -static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info) -{ - unsigned long flags; - - spin_lock_irqsave(info->lock, flags); - info->crypt_quiesced = 1; - spin_unlock_irqrestore(info->lock, flags); -} - -static void lib80211_crypt_deinit_handler(struct timer_list *t) -{ - struct lib80211_crypt_info *info = from_timer(info, t, - crypt_deinit_timer); - unsigned long flags; - - lib80211_crypt_deinit_entries(info, 0); - - spin_lock_irqsave(info->lock, flags); - if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) { - printk(KERN_DEBUG "%s: entries remaining in delayed crypt " - "deletion list\n", info->name); - info->crypt_deinit_timer.expires = jiffies + HZ; - add_timer(&info->crypt_deinit_timer); - } - spin_unlock_irqrestore(info->lock, flags); -} - -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt) -{ - struct lib80211_crypt_data *tmp; - unsigned long flags; - - if (*crypt == NULL) - return; - - tmp = *crypt; - *crypt = NULL; - - /* must not run ops->deinit() while there may be pending encrypt or - * decrypt operations. Use a list of delayed deinits to avoid needing - * locking. */ - - spin_lock_irqsave(info->lock, flags); - if (!info->crypt_quiesced) { - list_add(&tmp->list, &info->crypt_deinit_list); - if (!timer_pending(&info->crypt_deinit_timer)) { - info->crypt_deinit_timer.expires = jiffies + HZ; - add_timer(&info->crypt_deinit_timer); - } - } - spin_unlock_irqrestore(info->lock, flags); -} -EXPORT_SYMBOL(lib80211_crypt_delayed_deinit); - -int lib80211_register_crypto_ops(const struct lib80211_crypto_ops *ops) -{ - unsigned long flags; - struct lib80211_crypto_alg *alg; - - alg = kzalloc(sizeof(*alg), GFP_KERNEL); - if (alg == NULL) - return -ENOMEM; - - alg->ops = ops; - - spin_lock_irqsave(&lib80211_crypto_lock, flags); - list_add(&alg->list, &lib80211_crypto_algs); - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - - printk(KERN_DEBUG "lib80211_crypt: registered algorithm '%s'\n", - ops->name); - - return 0; -} -EXPORT_SYMBOL(lib80211_register_crypto_ops); - -int lib80211_unregister_crypto_ops(const struct lib80211_crypto_ops *ops) -{ - struct lib80211_crypto_alg *alg; - unsigned long flags; - - spin_lock_irqsave(&lib80211_crypto_lock, flags); - list_for_each_entry(alg, &lib80211_crypto_algs, list) { - if (alg->ops == ops) - goto found; - } - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - return -EINVAL; - - found: - printk(KERN_DEBUG "lib80211_crypt: unregistered algorithm '%s'\n", - ops->name); - list_del(&alg->list); - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - kfree(alg); - return 0; -} -EXPORT_SYMBOL(lib80211_unregister_crypto_ops); - -const struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name) -{ - struct lib80211_crypto_alg *alg; - unsigned long flags; - - spin_lock_irqsave(&lib80211_crypto_lock, flags); - list_for_each_entry(alg, &lib80211_crypto_algs, list) { - if (strcmp(alg->ops->name, name) == 0) - goto found; - } - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - return NULL; - - found: - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - return alg->ops; -} -EXPORT_SYMBOL(lib80211_get_crypto_ops); - -static void *lib80211_crypt_null_init(int keyidx) -{ - return (void *)1; -} - -static void lib80211_crypt_null_deinit(void *priv) -{ -} - -static const struct lib80211_crypto_ops lib80211_crypt_null = { - .name = "NULL", - .init = lib80211_crypt_null_init, - .deinit = lib80211_crypt_null_deinit, - .owner = THIS_MODULE, -}; - -static int __init lib80211_init(void) -{ - pr_info(DRV_DESCRIPTION "\n"); - return lib80211_register_crypto_ops(&lib80211_crypt_null); -} - -static void __exit lib80211_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_null); - BUG_ON(!list_empty(&lib80211_crypto_algs)); -} - -module_init(lib80211_init); -module_exit(lib80211_exit); diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c deleted file mode 100644 index 5aad139130e1..000000000000 --- a/net/wireless/lib80211_crypt_ccmp.c +++ /dev/null @@ -1,448 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 crypt: host-based CCMP encryption implementation for lib80211 - * - * Copyright (c) 2003-2004, Jouni Malinen - * Copyright (c) 2008, John W. Linville - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include - -MODULE_AUTHOR("Jouni Malinen"); -MODULE_DESCRIPTION("Host AP crypt: CCMP"); -MODULE_LICENSE("GPL"); - -#define AES_BLOCK_LEN 16 -#define CCMP_HDR_LEN 8 -#define CCMP_MIC_LEN 8 -#define CCMP_TK_LEN 16 -#define CCMP_PN_LEN 6 - -struct lib80211_ccmp_data { - u8 key[CCMP_TK_LEN]; - int key_set; - - u8 tx_pn[CCMP_PN_LEN]; - u8 rx_pn[CCMP_PN_LEN]; - - u32 dot11RSNAStatsCCMPFormatErrors; - u32 dot11RSNAStatsCCMPReplays; - u32 dot11RSNAStatsCCMPDecryptErrors; - - int key_idx; - - struct crypto_aead *tfm; - - /* scratch buffers for virt_to_page() (crypto API) */ - u8 tx_aad[2 * AES_BLOCK_LEN]; - u8 rx_aad[2 * AES_BLOCK_LEN]; -}; - -static void *lib80211_ccmp_init(int key_idx) -{ - struct lib80211_ccmp_data *priv; - - priv = kzalloc(sizeof(*priv), GFP_ATOMIC); - if (priv == NULL) - goto fail; - priv->key_idx = key_idx; - - priv->tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(priv->tfm)) { - priv->tfm = NULL; - goto fail; - } - - return priv; - - fail: - if (priv) { - if (priv->tfm) - crypto_free_aead(priv->tfm); - kfree(priv); - } - - return NULL; -} - -static void lib80211_ccmp_deinit(void *priv) -{ - struct lib80211_ccmp_data *_priv = priv; - if (_priv && _priv->tfm) - crypto_free_aead(_priv->tfm); - kfree(priv); -} - -static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, - const u8 *pn, u8 *iv, u8 *aad) -{ - u8 *pos, qc = 0; - size_t aad_len; - int a4_included, qc_included; - - a4_included = ieee80211_has_a4(hdr->frame_control); - qc_included = ieee80211_is_data_qos(hdr->frame_control); - - aad_len = 22; - if (a4_included) - aad_len += 6; - if (qc_included) { - pos = (u8 *) & hdr->addr4; - if (a4_included) - pos += 6; - qc = *pos & 0x0f; - aad_len += 2; - } - - /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC - * mode authentication are not allowed to collide, yet both are derived - * from the same vector. We only set L := 1 here to indicate that the - * data size can be represented in (L+1) bytes. The CCM layer will take - * care of storing the data length in the top (L+1) bytes and setting - * and clearing the other bits as is required to derive the two IVs. - */ - iv[0] = 0x1; - - /* Nonce: QC | A2 | PN */ - iv[1] = qc; - memcpy(iv + 2, hdr->addr2, ETH_ALEN); - memcpy(iv + 8, pn, CCMP_PN_LEN); - - /* AAD: - * FC with bits 4..6 and 11..13 masked to zero; 14 is always one - * A1 | A2 | A3 - * SC with bits 4..15 (seq#) masked to zero - * A4 (if present) - * QC (if present) - */ - pos = (u8 *) hdr; - aad[0] = pos[0] & 0x8f; - aad[1] = pos[1] & 0xc7; - memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); - pos = (u8 *) & hdr->seq_ctrl; - aad[20] = pos[0] & 0x0f; - aad[21] = 0; /* all bits masked */ - memset(aad + 22, 0, 8); - if (a4_included) - memcpy(aad + 22, hdr->addr4, ETH_ALEN); - if (qc_included) { - aad[a4_included ? 28 : 22] = qc; - /* rest of QC masked */ - } - return aad_len; -} - -static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, - u8 *aeskey, int keylen, void *priv) -{ - struct lib80211_ccmp_data *key = priv; - int i; - u8 *pos; - - if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len) - return -1; - - if (aeskey != NULL && keylen >= CCMP_TK_LEN) - memcpy(aeskey, key->key, CCMP_TK_LEN); - - pos = skb_push(skb, CCMP_HDR_LEN); - memmove(pos, pos + CCMP_HDR_LEN, hdr_len); - pos += hdr_len; - - i = CCMP_PN_LEN - 1; - while (i >= 0) { - key->tx_pn[i]++; - if (key->tx_pn[i] != 0) - break; - i--; - } - - *pos++ = key->tx_pn[5]; - *pos++ = key->tx_pn[4]; - *pos++ = 0; - *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ; - *pos++ = key->tx_pn[3]; - *pos++ = key->tx_pn[2]; - *pos++ = key->tx_pn[1]; - *pos++ = key->tx_pn[0]; - - return CCMP_HDR_LEN; -} - -static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_ccmp_data *key = priv; - struct ieee80211_hdr *hdr; - struct aead_request *req; - struct scatterlist sg[2]; - u8 *aad = key->tx_aad; - u8 iv[AES_BLOCK_LEN]; - int len, data_len, aad_len; - int ret; - - if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) - return -1; - - data_len = skb->len - hdr_len; - len = lib80211_ccmp_hdr(skb, hdr_len, NULL, 0, priv); - if (len < 0) - return -1; - - req = aead_request_alloc(key->tfm, GFP_ATOMIC); - if (!req) - return -ENOMEM; - - hdr = (struct ieee80211_hdr *)skb->data; - aad_len = ccmp_init_iv_and_aad(hdr, key->tx_pn, iv, aad); - - skb_put(skb, CCMP_MIC_LEN); - - sg_init_table(sg, 2); - sg_set_buf(&sg[0], aad, aad_len); - sg_set_buf(&sg[1], skb->data + hdr_len + CCMP_HDR_LEN, - data_len + CCMP_MIC_LEN); - - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_ad(req, aad_len); - aead_request_set_crypt(req, sg, sg, data_len, iv); - - ret = crypto_aead_encrypt(req); - aead_request_free(req); - - return ret; -} - -/* - * deal with seq counter wrapping correctly. - * refer to timer_after() for jiffies wrapping handling - */ -static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o) -{ - u32 iv32_n, iv16_n; - u32 iv32_o, iv16_o; - - iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3]; - iv16_n = (pn_n[4] << 8) | pn_n[5]; - - iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3]; - iv16_o = (pn_o[4] << 8) | pn_o[5]; - - if ((s32)iv32_n - (s32)iv32_o < 0 || - (iv32_n == iv32_o && iv16_n <= iv16_o)) - return 1; - return 0; -} - -static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_ccmp_data *key = priv; - u8 keyidx, *pos; - struct ieee80211_hdr *hdr; - struct aead_request *req; - struct scatterlist sg[2]; - u8 *aad = key->rx_aad; - u8 iv[AES_BLOCK_LEN]; - u8 pn[6]; - int aad_len, ret; - size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN; - - if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) { - key->dot11RSNAStatsCCMPFormatErrors++; - return -1; - } - - hdr = (struct ieee80211_hdr *)skb->data; - pos = skb->data + hdr_len; - keyidx = pos[3]; - if (!(keyidx & (1 << 5))) { - net_dbg_ratelimited("CCMP: received packet without ExtIV flag from %pM\n", - hdr->addr2); - key->dot11RSNAStatsCCMPFormatErrors++; - return -2; - } - keyidx >>= 6; - if (key->key_idx != keyidx) { - net_dbg_ratelimited("CCMP: RX tkey->key_idx=%d frame keyidx=%d\n", - key->key_idx, keyidx); - return -6; - } - if (!key->key_set) { - net_dbg_ratelimited("CCMP: received packet from %pM with keyid=%d that does not have a configured key\n", - hdr->addr2, keyidx); - return -3; - } - - pn[0] = pos[7]; - pn[1] = pos[6]; - pn[2] = pos[5]; - pn[3] = pos[4]; - pn[4] = pos[1]; - pn[5] = pos[0]; - pos += 8; - - if (ccmp_replay_check(pn, key->rx_pn)) { -#ifdef CONFIG_LIB80211_DEBUG - net_dbg_ratelimited("CCMP: replay detected: STA=%pM previous PN %02x%02x%02x%02x%02x%02x received PN %02x%02x%02x%02x%02x%02x\n", - hdr->addr2, - key->rx_pn[0], key->rx_pn[1], key->rx_pn[2], - key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], - pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); -#endif - key->dot11RSNAStatsCCMPReplays++; - return -4; - } - - req = aead_request_alloc(key->tfm, GFP_ATOMIC); - if (!req) - return -ENOMEM; - - aad_len = ccmp_init_iv_and_aad(hdr, pn, iv, aad); - - sg_init_table(sg, 2); - sg_set_buf(&sg[0], aad, aad_len); - sg_set_buf(&sg[1], pos, data_len); - - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_ad(req, aad_len); - aead_request_set_crypt(req, sg, sg, data_len, iv); - - ret = crypto_aead_decrypt(req); - aead_request_free(req); - - if (ret) { - net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM (%d)\n", - hdr->addr2, ret); - key->dot11RSNAStatsCCMPDecryptErrors++; - return -5; - } - - memcpy(key->rx_pn, pn, CCMP_PN_LEN); - - /* Remove hdr and MIC */ - memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len); - skb_pull(skb, CCMP_HDR_LEN); - skb_trim(skb, skb->len - CCMP_MIC_LEN); - - return keyidx; -} - -static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_ccmp_data *data = priv; - int keyidx; - struct crypto_aead *tfm = data->tfm; - - keyidx = data->key_idx; - memset(data, 0, sizeof(*data)); - data->key_idx = keyidx; - data->tfm = tfm; - if (len == CCMP_TK_LEN) { - memcpy(data->key, key, CCMP_TK_LEN); - data->key_set = 1; - if (seq) { - data->rx_pn[0] = seq[5]; - data->rx_pn[1] = seq[4]; - data->rx_pn[2] = seq[3]; - data->rx_pn[3] = seq[2]; - data->rx_pn[4] = seq[1]; - data->rx_pn[5] = seq[0]; - } - if (crypto_aead_setauthsize(data->tfm, CCMP_MIC_LEN) || - crypto_aead_setkey(data->tfm, data->key, CCMP_TK_LEN)) - return -1; - } else if (len == 0) - data->key_set = 0; - else - return -1; - - return 0; -} - -static int lib80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_ccmp_data *data = priv; - - if (len < CCMP_TK_LEN) - return -1; - - if (!data->key_set) - return 0; - memcpy(key, data->key, CCMP_TK_LEN); - - if (seq) { - seq[0] = data->tx_pn[5]; - seq[1] = data->tx_pn[4]; - seq[2] = data->tx_pn[3]; - seq[3] = data->tx_pn[2]; - seq[4] = data->tx_pn[1]; - seq[5] = data->tx_pn[0]; - } - - return CCMP_TK_LEN; -} - -static void lib80211_ccmp_print_stats(struct seq_file *m, void *priv) -{ - struct lib80211_ccmp_data *ccmp = priv; - - seq_printf(m, - "key[%d] alg=CCMP key_set=%d " - "tx_pn=%02x%02x%02x%02x%02x%02x " - "rx_pn=%02x%02x%02x%02x%02x%02x " - "format_errors=%d replays=%d decrypt_errors=%d\n", - ccmp->key_idx, ccmp->key_set, - ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], - ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], - ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], - ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], - ccmp->dot11RSNAStatsCCMPFormatErrors, - ccmp->dot11RSNAStatsCCMPReplays, - ccmp->dot11RSNAStatsCCMPDecryptErrors); -} - -static const struct lib80211_crypto_ops lib80211_crypt_ccmp = { - .name = "CCMP", - .init = lib80211_ccmp_init, - .deinit = lib80211_ccmp_deinit, - .encrypt_mpdu = lib80211_ccmp_encrypt, - .decrypt_mpdu = lib80211_ccmp_decrypt, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = lib80211_ccmp_set_key, - .get_key = lib80211_ccmp_get_key, - .print_stats = lib80211_ccmp_print_stats, - .extra_mpdu_prefix_len = CCMP_HDR_LEN, - .extra_mpdu_postfix_len = CCMP_MIC_LEN, - .owner = THIS_MODULE, -}; - -static int __init lib80211_crypto_ccmp_init(void) -{ - return lib80211_register_crypto_ops(&lib80211_crypt_ccmp); -} - -static void __exit lib80211_crypto_ccmp_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_ccmp); -} - -module_init(lib80211_crypto_ccmp_init); -module_exit(lib80211_crypto_ccmp_exit); diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c deleted file mode 100644 index 63e68e5e121e..000000000000 --- a/net/wireless/lib80211_crypt_tkip.c +++ /dev/null @@ -1,738 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 crypt: host-based TKIP encryption implementation for lib80211 - * - * Copyright (c) 2003-2004, Jouni Malinen - * Copyright (c) 2008, John W. Linville - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -MODULE_AUTHOR("Jouni Malinen"); -MODULE_DESCRIPTION("lib80211 crypt: TKIP"); -MODULE_LICENSE("GPL"); - -#define TKIP_HDR_LEN 8 - -struct lib80211_tkip_data { -#define TKIP_KEY_LEN 32 - u8 key[TKIP_KEY_LEN]; - int key_set; - - u32 tx_iv32; - u16 tx_iv16; - u16 tx_ttak[5]; - int tx_phase1_done; - - u32 rx_iv32; - u16 rx_iv16; - u16 rx_ttak[5]; - int rx_phase1_done; - u32 rx_iv32_new; - u16 rx_iv16_new; - - u32 dot11RSNAStatsTKIPReplays; - u32 dot11RSNAStatsTKIPICVErrors; - u32 dot11RSNAStatsTKIPLocalMICFailures; - - int key_idx; - - struct arc4_ctx rx_ctx_arc4; - struct arc4_ctx tx_ctx_arc4; - struct crypto_shash *rx_tfm_michael; - struct crypto_shash *tx_tfm_michael; - - /* scratch buffers for virt_to_page() (crypto API) */ - u8 rx_hdr[16], tx_hdr[16]; - - unsigned long flags; -}; - -static unsigned long lib80211_tkip_set_flags(unsigned long flags, void *priv) -{ - struct lib80211_tkip_data *_priv = priv; - unsigned long old_flags = _priv->flags; - _priv->flags = flags; - return old_flags; -} - -static unsigned long lib80211_tkip_get_flags(void *priv) -{ - struct lib80211_tkip_data *_priv = priv; - return _priv->flags; -} - -static void *lib80211_tkip_init(int key_idx) -{ - struct lib80211_tkip_data *priv; - - if (fips_enabled) - return NULL; - - priv = kzalloc(sizeof(*priv), GFP_ATOMIC); - if (priv == NULL) - goto fail; - - priv->key_idx = key_idx; - - priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); - if (IS_ERR(priv->tx_tfm_michael)) { - priv->tx_tfm_michael = NULL; - goto fail; - } - - priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); - if (IS_ERR(priv->rx_tfm_michael)) { - priv->rx_tfm_michael = NULL; - goto fail; - } - - return priv; - - fail: - if (priv) { - crypto_free_shash(priv->tx_tfm_michael); - crypto_free_shash(priv->rx_tfm_michael); - kfree(priv); - } - - return NULL; -} - -static void lib80211_tkip_deinit(void *priv) -{ - struct lib80211_tkip_data *_priv = priv; - if (_priv) { - crypto_free_shash(_priv->tx_tfm_michael); - crypto_free_shash(_priv->rx_tfm_michael); - } - kfree_sensitive(priv); -} - -static inline u16 RotR1(u16 val) -{ - return (val >> 1) | (val << 15); -} - -static inline u8 Lo8(u16 val) -{ - return val & 0xff; -} - -static inline u8 Hi8(u16 val) -{ - return val >> 8; -} - -static inline u16 Lo16(u32 val) -{ - return val & 0xffff; -} - -static inline u16 Hi16(u32 val) -{ - return val >> 16; -} - -static inline u16 Mk16(u8 hi, u8 lo) -{ - return lo | (((u16) hi) << 8); -} - -static inline u16 Mk16_le(__le16 * v) -{ - return le16_to_cpu(*v); -} - -static const u16 Sbox[256] = { - 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154, - 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A, - 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B, - 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B, - 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F, - 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F, - 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5, - 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F, - 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB, - 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397, - 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED, - 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A, - 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194, - 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3, - 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104, - 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D, - 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39, - 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695, - 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83, - 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76, - 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4, - 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B, - 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0, - 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018, - 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751, - 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85, - 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12, - 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9, - 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7, - 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A, - 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8, - 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A, -}; - -static inline u16 _S_(u16 v) -{ - u16 t = Sbox[Hi8(v)]; - return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8)); -} - -#define PHASE1_LOOP_COUNT 8 - -static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA, - u32 IV32) -{ - int i, j; - - /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */ - TTAK[0] = Lo16(IV32); - TTAK[1] = Hi16(IV32); - TTAK[2] = Mk16(TA[1], TA[0]); - TTAK[3] = Mk16(TA[3], TA[2]); - TTAK[4] = Mk16(TA[5], TA[4]); - - for (i = 0; i < PHASE1_LOOP_COUNT; i++) { - j = 2 * (i & 1); - TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j])); - TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j])); - TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j])); - TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j])); - TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i; - } -} - -static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, - u16 IV16) -{ - /* Make temporary area overlap WEP seed so that the final copy can be - * avoided on little endian hosts. */ - u16 *PPK = (u16 *) & WEPSeed[4]; - - /* Step 1 - make copy of TTAK and bring in TSC */ - PPK[0] = TTAK[0]; - PPK[1] = TTAK[1]; - PPK[2] = TTAK[2]; - PPK[3] = TTAK[3]; - PPK[4] = TTAK[4]; - PPK[5] = TTAK[4] + IV16; - - /* Step 2 - 96-bit bijective mixing using S-box */ - PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0])); - PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2])); - PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4])); - PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6])); - PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8])); - PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10])); - - PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12])); - PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14])); - PPK[2] += RotR1(PPK[1]); - PPK[3] += RotR1(PPK[2]); - PPK[4] += RotR1(PPK[3]); - PPK[5] += RotR1(PPK[4]); - - /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value - * WEPSeed[0..2] is transmitted as WEP IV */ - WEPSeed[0] = Hi8(IV16); - WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F; - WEPSeed[2] = Lo8(IV16); - WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1); - -#ifdef __BIG_ENDIAN - { - int i; - for (i = 0; i < 6; i++) - PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8); - } -#endif -} - -static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len, - u8 * rc4key, int keylen, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 *pos; - struct ieee80211_hdr *hdr; - - hdr = (struct ieee80211_hdr *)skb->data; - - if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len) - return -1; - - if (rc4key == NULL || keylen < 16) - return -1; - - if (!tkey->tx_phase1_done) { - tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2, - tkey->tx_iv32); - tkey->tx_phase1_done = 1; - } - tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16); - - pos = skb_push(skb, TKIP_HDR_LEN); - memmove(pos, pos + TKIP_HDR_LEN, hdr_len); - pos += hdr_len; - - *pos++ = *rc4key; - *pos++ = *(rc4key + 1); - *pos++ = *(rc4key + 2); - *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ; - *pos++ = tkey->tx_iv32 & 0xff; - *pos++ = (tkey->tx_iv32 >> 8) & 0xff; - *pos++ = (tkey->tx_iv32 >> 16) & 0xff; - *pos++ = (tkey->tx_iv32 >> 24) & 0xff; - - tkey->tx_iv16++; - if (tkey->tx_iv16 == 0) { - tkey->tx_phase1_done = 0; - tkey->tx_iv32++; - } - - return TKIP_HDR_LEN; -} - -static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - int len; - u8 rc4key[16], *pos, *icv; - u32 crc; - - if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - net_dbg_ratelimited("TKIP countermeasures: dropped TX packet to %pM\n", - hdr->addr1); - return -1; - } - - if (skb_tailroom(skb) < 4 || skb->len < hdr_len) - return -1; - - len = skb->len - hdr_len; - pos = skb->data + hdr_len; - - if ((lib80211_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0) - return -1; - - crc = ~crc32_le(~0, pos, len); - icv = skb_put(skb, 4); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - - arc4_setkey(&tkey->tx_ctx_arc4, rc4key, 16); - arc4_crypt(&tkey->tx_ctx_arc4, pos, pos, len + 4); - - return 0; -} - -/* - * deal with seq counter wrapping correctly. - * refer to timer_after() for jiffies wrapping handling - */ -static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, - u32 iv32_o, u16 iv16_o) -{ - if ((s32)iv32_n - (s32)iv32_o < 0 || - (iv32_n == iv32_o && iv16_n <= iv16_o)) - return 1; - return 0; -} - -static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 rc4key[16]; - u8 keyidx, *pos; - u32 iv32; - u16 iv16; - struct ieee80211_hdr *hdr; - u8 icv[4]; - u32 crc; - int plen; - - hdr = (struct ieee80211_hdr *)skb->data; - - if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { - net_dbg_ratelimited("TKIP countermeasures: dropped received packet from %pM\n", - hdr->addr2); - return -1; - } - - if (skb->len < hdr_len + TKIP_HDR_LEN + 4) - return -1; - - pos = skb->data + hdr_len; - keyidx = pos[3]; - if (!(keyidx & (1 << 5))) { - net_dbg_ratelimited("TKIP: received packet without ExtIV flag from %pM\n", - hdr->addr2); - return -2; - } - keyidx >>= 6; - if (tkey->key_idx != keyidx) { - net_dbg_ratelimited("TKIP: RX tkey->key_idx=%d frame keyidx=%d\n", - tkey->key_idx, keyidx); - return -6; - } - if (!tkey->key_set) { - net_dbg_ratelimited("TKIP: received packet from %pM with keyid=%d that does not have a configured key\n", - hdr->addr2, keyidx); - return -3; - } - iv16 = (pos[0] << 8) | pos[2]; - iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24); - pos += TKIP_HDR_LEN; - - if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { -#ifdef CONFIG_LIB80211_DEBUG - net_dbg_ratelimited("TKIP: replay detected: STA=%pM previous TSC %08x%04x received TSC %08x%04x\n", - hdr->addr2, tkey->rx_iv32, tkey->rx_iv16, - iv32, iv16); -#endif - tkey->dot11RSNAStatsTKIPReplays++; - return -4; - } - - if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) { - tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32); - tkey->rx_phase1_done = 1; - } - tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16); - - plen = skb->len - hdr_len - 12; - - arc4_setkey(&tkey->rx_ctx_arc4, rc4key, 16); - arc4_crypt(&tkey->rx_ctx_arc4, pos, pos, plen + 4); - - crc = ~crc32_le(~0, pos, plen); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - if (memcmp(icv, pos + plen, 4) != 0) { - if (iv32 != tkey->rx_iv32) { - /* Previously cached Phase1 result was already lost, so - * it needs to be recalculated for the next packet. */ - tkey->rx_phase1_done = 0; - } -#ifdef CONFIG_LIB80211_DEBUG - net_dbg_ratelimited("TKIP: ICV error detected: STA=%pM\n", - hdr->addr2); -#endif - tkey->dot11RSNAStatsTKIPICVErrors++; - return -5; - } - - /* Update real counters only after Michael MIC verification has - * completed */ - tkey->rx_iv32_new = iv32; - tkey->rx_iv16_new = iv16; - - /* Remove IV and ICV */ - memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len); - skb_pull(skb, TKIP_HDR_LEN); - skb_trim(skb, skb->len - 4); - - return keyidx; -} - -static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, - u8 *data, size_t data_len, u8 *mic) -{ - SHASH_DESC_ON_STACK(desc, tfm_michael); - int err; - - if (tfm_michael == NULL) { - pr_warn("%s(): tfm_michael == NULL\n", __func__); - return -1; - } - - desc->tfm = tfm_michael; - - if (crypto_shash_setkey(tfm_michael, key, 8)) - return -1; - - err = crypto_shash_init(desc); - if (err) - goto out; - err = crypto_shash_update(desc, hdr, 16); - if (err) - goto out; - err = crypto_shash_update(desc, data, data_len); - if (err) - goto out; - err = crypto_shash_final(desc, mic); - -out: - shash_desc_zero(desc); - return err; -} - -static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) -{ - struct ieee80211_hdr *hdr11; - - hdr11 = (struct ieee80211_hdr *)skb->data; - - switch (le16_to_cpu(hdr11->frame_control) & - (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) { - case IEEE80211_FCTL_TODS: - memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ - break; - case IEEE80211_FCTL_FROMDS: - memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */ - break; - case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS: - memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ - break; - default: - memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ - break; - } - - if (ieee80211_is_data_qos(hdr11->frame_control)) { - hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11))) - & IEEE80211_QOS_CTL_TID_MASK; - } else - hdr[12] = 0; /* priority */ - - hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */ -} - -static int lib80211_michael_mic_add(struct sk_buff *skb, int hdr_len, - void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 *pos; - - if (skb_tailroom(skb) < 8 || skb->len < hdr_len) { - printk(KERN_DEBUG "Invalid packet for Michael MIC add " - "(tailroom=%d hdr_len=%d skb->len=%d)\n", - skb_tailroom(skb), hdr_len, skb->len); - return -1; - } - - michael_mic_hdr(skb, tkey->tx_hdr); - pos = skb_put(skb, 8); - if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr, - skb->data + hdr_len, skb->len - 8 - hdr_len, pos)) - return -1; - - return 0; -} - -static void lib80211_michael_mic_failure(struct net_device *dev, - struct ieee80211_hdr *hdr, - int keyidx) -{ - union iwreq_data wrqu; - struct iw_michaelmicfailure ev; - - /* TODO: needed parameters: count, keyid, key type, TSC */ - memset(&ev, 0, sizeof(ev)); - ev.flags = keyidx & IW_MICFAILURE_KEY_ID; - if (hdr->addr1[0] & 0x01) - ev.flags |= IW_MICFAILURE_GROUP; - else - ev.flags |= IW_MICFAILURE_PAIRWISE; - ev.src_addr.sa_family = ARPHRD_ETHER; - memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN); - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = sizeof(ev); - wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev); -} - -static int lib80211_michael_mic_verify(struct sk_buff *skb, int keyidx, - int hdr_len, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 mic[8]; - - if (!tkey->key_set) - return -1; - - michael_mic_hdr(skb, tkey->rx_hdr); - if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr, - skb->data + hdr_len, skb->len - 8 - hdr_len, mic)) - return -1; - if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) { - struct ieee80211_hdr *hdr; - hdr = (struct ieee80211_hdr *)skb->data; - printk(KERN_DEBUG "%s: Michael MIC verification failed for " - "MSDU from %pM keyidx=%d\n", - skb->dev ? skb->dev->name : "N/A", hdr->addr2, - keyidx); - if (skb->dev) - lib80211_michael_mic_failure(skb->dev, hdr, keyidx); - tkey->dot11RSNAStatsTKIPLocalMICFailures++; - return -1; - } - - /* Update TSC counters for RX now that the packet verification has - * completed. */ - tkey->rx_iv32 = tkey->rx_iv32_new; - tkey->rx_iv16 = tkey->rx_iv16_new; - - skb_trim(skb, skb->len - 8); - - return 0; -} - -static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - int keyidx; - struct crypto_shash *tfm = tkey->tx_tfm_michael; - struct arc4_ctx *tfm2 = &tkey->tx_ctx_arc4; - struct crypto_shash *tfm3 = tkey->rx_tfm_michael; - struct arc4_ctx *tfm4 = &tkey->rx_ctx_arc4; - - keyidx = tkey->key_idx; - memset(tkey, 0, sizeof(*tkey)); - tkey->key_idx = keyidx; - tkey->tx_tfm_michael = tfm; - tkey->tx_ctx_arc4 = *tfm2; - tkey->rx_tfm_michael = tfm3; - tkey->rx_ctx_arc4 = *tfm4; - if (len == TKIP_KEY_LEN) { - memcpy(tkey->key, key, TKIP_KEY_LEN); - tkey->key_set = 1; - tkey->tx_iv16 = 1; /* TSC is initialized to 1 */ - if (seq) { - tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) | - (seq[3] << 8) | seq[2]; - tkey->rx_iv16 = (seq[1] << 8) | seq[0]; - } - } else if (len == 0) - tkey->key_set = 0; - else - return -1; - - return 0; -} - -static int lib80211_tkip_get_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - - if (len < TKIP_KEY_LEN) - return -1; - - if (!tkey->key_set) - return 0; - memcpy(key, tkey->key, TKIP_KEY_LEN); - - if (seq) { - /* - * Not clear if this should return the value as is - * or - as the code previously seemed to partially - * have been written as - subtract one from it. It - * was working this way for a long time so leave it. - */ - seq[0] = tkey->tx_iv16; - seq[1] = tkey->tx_iv16 >> 8; - seq[2] = tkey->tx_iv32; - seq[3] = tkey->tx_iv32 >> 8; - seq[4] = tkey->tx_iv32 >> 16; - seq[5] = tkey->tx_iv32 >> 24; - } - - return TKIP_KEY_LEN; -} - -static void lib80211_tkip_print_stats(struct seq_file *m, void *priv) -{ - struct lib80211_tkip_data *tkip = priv; - seq_printf(m, - "key[%d] alg=TKIP key_set=%d " - "tx_pn=%02x%02x%02x%02x%02x%02x " - "rx_pn=%02x%02x%02x%02x%02x%02x " - "replays=%d icv_errors=%d local_mic_failures=%d\n", - tkip->key_idx, tkip->key_set, - (tkip->tx_iv32 >> 24) & 0xff, - (tkip->tx_iv32 >> 16) & 0xff, - (tkip->tx_iv32 >> 8) & 0xff, - tkip->tx_iv32 & 0xff, - (tkip->tx_iv16 >> 8) & 0xff, - tkip->tx_iv16 & 0xff, - (tkip->rx_iv32 >> 24) & 0xff, - (tkip->rx_iv32 >> 16) & 0xff, - (tkip->rx_iv32 >> 8) & 0xff, - tkip->rx_iv32 & 0xff, - (tkip->rx_iv16 >> 8) & 0xff, - tkip->rx_iv16 & 0xff, - tkip->dot11RSNAStatsTKIPReplays, - tkip->dot11RSNAStatsTKIPICVErrors, - tkip->dot11RSNAStatsTKIPLocalMICFailures); -} - -static const struct lib80211_crypto_ops lib80211_crypt_tkip = { - .name = "TKIP", - .init = lib80211_tkip_init, - .deinit = lib80211_tkip_deinit, - .encrypt_mpdu = lib80211_tkip_encrypt, - .decrypt_mpdu = lib80211_tkip_decrypt, - .encrypt_msdu = lib80211_michael_mic_add, - .decrypt_msdu = lib80211_michael_mic_verify, - .set_key = lib80211_tkip_set_key, - .get_key = lib80211_tkip_get_key, - .print_stats = lib80211_tkip_print_stats, - .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */ - .extra_mpdu_postfix_len = 4, /* ICV */ - .extra_msdu_postfix_len = 8, /* MIC */ - .get_flags = lib80211_tkip_get_flags, - .set_flags = lib80211_tkip_set_flags, - .owner = THIS_MODULE, -}; - -static int __init lib80211_crypto_tkip_init(void) -{ - return lib80211_register_crypto_ops(&lib80211_crypt_tkip); -} - -static void __exit lib80211_crypto_tkip_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_tkip); -} - -module_init(lib80211_crypto_tkip_init); -module_exit(lib80211_crypto_tkip_exit); diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c deleted file mode 100644 index 3b148c7bef85..000000000000 --- a/net/wireless/lib80211_crypt_wep.c +++ /dev/null @@ -1,256 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 crypt: host-based WEP encryption implementation for lib80211 - * - * Copyright (c) 2002-2004, Jouni Malinen - * Copyright (c) 2008, John W. Linville - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -MODULE_AUTHOR("Jouni Malinen"); -MODULE_DESCRIPTION("lib80211 crypt: WEP"); -MODULE_LICENSE("GPL"); - -struct lib80211_wep_data { - u32 iv; -#define WEP_KEY_LEN 13 - u8 key[WEP_KEY_LEN + 1]; - u8 key_len; - u8 key_idx; - struct arc4_ctx tx_ctx; - struct arc4_ctx rx_ctx; -}; - -static void *lib80211_wep_init(int keyidx) -{ - struct lib80211_wep_data *priv; - - if (fips_enabled) - return NULL; - - priv = kzalloc(sizeof(*priv), GFP_ATOMIC); - if (priv == NULL) - return NULL; - priv->key_idx = keyidx; - - /* start WEP IV from a random value */ - get_random_bytes(&priv->iv, 4); - - return priv; -} - -static void lib80211_wep_deinit(void *priv) -{ - kfree_sensitive(priv); -} - -/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */ -static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len, - u8 *key, int keylen, void *priv) -{ - struct lib80211_wep_data *wep = priv; - u32 klen; - u8 *pos; - - if (skb_headroom(skb) < 4 || skb->len < hdr_len) - return -1; - - pos = skb_push(skb, 4); - memmove(pos, pos + 4, hdr_len); - pos += hdr_len; - - klen = 3 + wep->key_len; - - wep->iv++; - - /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key - * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N) - * can be used to speedup attacks, so avoid using them. */ - if ((wep->iv & 0xff00) == 0xff00) { - u8 B = (wep->iv >> 16) & 0xff; - if (B >= 3 && B < klen) - wep->iv += 0x0100; - } - - /* Prepend 24-bit IV to RC4 key and TX frame */ - *pos++ = (wep->iv >> 16) & 0xff; - *pos++ = (wep->iv >> 8) & 0xff; - *pos++ = wep->iv & 0xff; - *pos++ = wep->key_idx << 6; - - return 0; -} - -/* Perform WEP encryption on given skb that has at least 4 bytes of headroom - * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, - * so the payload length increases with 8 bytes. - * - * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) - */ -static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_wep_data *wep = priv; - u32 crc, klen, len; - u8 *pos, *icv; - u8 key[WEP_KEY_LEN + 3]; - - /* other checks are in lib80211_wep_build_iv */ - if (skb_tailroom(skb) < 4) - return -1; - - /* add the IV to the frame */ - if (lib80211_wep_build_iv(skb, hdr_len, NULL, 0, priv)) - return -1; - - /* Copy the IV into the first 3 bytes of the key */ - skb_copy_from_linear_data_offset(skb, hdr_len, key, 3); - - /* Copy rest of the WEP key (the secret part) */ - memcpy(key + 3, wep->key, wep->key_len); - - len = skb->len - hdr_len - 4; - pos = skb->data + hdr_len + 4; - klen = 3 + wep->key_len; - - /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */ - crc = ~crc32_le(~0, pos, len); - icv = skb_put(skb, 4); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - - arc4_setkey(&wep->tx_ctx, key, klen); - arc4_crypt(&wep->tx_ctx, pos, pos, len + 4); - - return 0; -} - -/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of - * the frame: IV (4 bytes), encrypted payload (including SNAP header), - * ICV (4 bytes). len includes both IV and ICV. - * - * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on - * failure. If frame is OK, IV and ICV will be removed. - */ -static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_wep_data *wep = priv; - u32 crc, klen, plen; - u8 key[WEP_KEY_LEN + 3]; - u8 keyidx, *pos, icv[4]; - - if (skb->len < hdr_len + 8) - return -1; - - pos = skb->data + hdr_len; - key[0] = *pos++; - key[1] = *pos++; - key[2] = *pos++; - keyidx = *pos++ >> 6; - if (keyidx != wep->key_idx) - return -1; - - klen = 3 + wep->key_len; - - /* Copy rest of the WEP key (the secret part) */ - memcpy(key + 3, wep->key, wep->key_len); - - /* Apply RC4 to data and compute CRC32 over decrypted data */ - plen = skb->len - hdr_len - 8; - - arc4_setkey(&wep->rx_ctx, key, klen); - arc4_crypt(&wep->rx_ctx, pos, pos, plen + 4); - - crc = ~crc32_le(~0, pos, plen); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - if (memcmp(icv, pos + plen, 4) != 0) { - /* ICV mismatch - drop frame */ - return -2; - } - - /* Remove IV and ICV */ - memmove(skb->data + 4, skb->data, hdr_len); - skb_pull(skb, 4); - skb_trim(skb, skb->len - 4); - - return 0; -} - -static int lib80211_wep_set_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_wep_data *wep = priv; - - if (len < 0 || len > WEP_KEY_LEN) - return -1; - - memcpy(wep->key, key, len); - wep->key_len = len; - - return 0; -} - -static int lib80211_wep_get_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_wep_data *wep = priv; - - if (len < wep->key_len) - return -1; - - memcpy(key, wep->key, wep->key_len); - - return wep->key_len; -} - -static void lib80211_wep_print_stats(struct seq_file *m, void *priv) -{ - struct lib80211_wep_data *wep = priv; - seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); -} - -static const struct lib80211_crypto_ops lib80211_crypt_wep = { - .name = "WEP", - .init = lib80211_wep_init, - .deinit = lib80211_wep_deinit, - .encrypt_mpdu = lib80211_wep_encrypt, - .decrypt_mpdu = lib80211_wep_decrypt, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = lib80211_wep_set_key, - .get_key = lib80211_wep_get_key, - .print_stats = lib80211_wep_print_stats, - .extra_mpdu_prefix_len = 4, /* IV */ - .extra_mpdu_postfix_len = 4, /* ICV */ - .owner = THIS_MODULE, -}; - -static int __init lib80211_crypto_wep_init(void) -{ - return lib80211_register_crypto_ops(&lib80211_crypt_wep); -} - -static void __exit lib80211_crypto_wep_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_wep); -} - -module_init(lib80211_crypto_wep_init); -module_exit(lib80211_crypto_wep_exit); -- cgit v1.2.3 From 3a1d429ebd43bcfdf3590096ca72cbf593d1598b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:02:53 +0200 Subject: wifi: wext/libipw: move spy implementation to libipw There's no driver left using this other than ipw2200, so move the data bookkeeping and code into libipw. Link: https://patch.msgid.link/20241007210254.037d864cda7d.Ib2197cb056ff05746d3521a5fba637062acb7314@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/ipw2x00/Kconfig | 3 - drivers/net/wireless/intel/ipw2x00/Makefile | 1 + drivers/net/wireless/intel/ipw2x00/ipw2200.c | 10 +- drivers/net/wireless/intel/ipw2x00/libipw.h | 13 ++ drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_spy.c | 232 ++++++++++++++++++++++++ include/net/iw_handler.h | 18 -- net/wireless/Kconfig | 3 - net/wireless/Makefile | 1 - net/wireless/wext-spy.c | 232 ------------------------ 10 files changed, 252 insertions(+), 263 deletions(-) create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_spy.c delete mode 100644 net/wireless/wext-spy.c (limited to 'include/net') diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index d9c042772399..ce34118f1e90 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -7,7 +7,6 @@ config IPW2100 tristate "Intel PRO/Wireless 2100 Network Connection" depends on PCI && CFG80211 select WIRELESS_EXT - select WEXT_SPY select WEXT_PRIV select FW_LOADER select LIBIPW @@ -68,7 +67,6 @@ config IPW2200 depends on PCI && CFG80211 select CFG80211_WEXT_EXPORT select WIRELESS_EXT - select WEXT_SPY select WEXT_PRIV select FW_LOADER select LIBIPW @@ -156,7 +154,6 @@ config LIBIPW tristate depends on PCI && CFG80211 select WIRELESS_EXT - select WEXT_SPY select CRYPTO select CRYPTO_MICHAEL_MIC select CRC32 diff --git a/drivers/net/wireless/intel/ipw2x00/Makefile b/drivers/net/wireless/intel/ipw2x00/Makefile index 60c5faccbe15..91e6091c4ebf 100644 --- a/drivers/net/wireless/intel/ipw2x00/Makefile +++ b/drivers/net/wireless/intel/ipw2x00/Makefile @@ -13,6 +13,7 @@ libipw-objs := \ libipw_rx.o \ libipw_wx.o \ libipw_geo.o \ + libipw_spy.o \ libipw_crypto.o \ libipw_crypto_ccmp.o \ libipw_crypto_tkip.o \ diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index f4fd1fc784b7..0008b4615731 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -9856,10 +9856,10 @@ static iw_handler ipw_wx_handlers[] = { IW_HANDLER(SIOCGIWENCODE, ipw_wx_get_encode), IW_HANDLER(SIOCSIWPOWER, ipw_wx_set_power), IW_HANDLER(SIOCGIWPOWER, ipw_wx_get_power), - IW_HANDLER(SIOCSIWSPY, iw_handler_set_spy), - IW_HANDLER(SIOCGIWSPY, iw_handler_get_spy), - IW_HANDLER(SIOCSIWTHRSPY, iw_handler_set_thrspy), - IW_HANDLER(SIOCGIWTHRSPY, iw_handler_get_thrspy), + IW_HANDLER(SIOCSIWSPY, ipw_wx_set_spy), + IW_HANDLER(SIOCGIWSPY, ipw_wx_get_spy), + IW_HANDLER(SIOCSIWTHRSPY, ipw_wx_set_thrspy), + IW_HANDLER(SIOCGIWTHRSPY, ipw_wx_get_thrspy), IW_HANDLER(SIOCSIWGENIE, ipw_wx_set_genie), IW_HANDLER(SIOCGIWGENIE, ipw_wx_get_genie), IW_HANDLER(SIOCSIWMLME, ipw_wx_set_mlme), @@ -11636,7 +11636,7 @@ static int ipw_pci_probe(struct pci_dev *pdev, priv->ieee->worst_rssi = -85; net_dev->netdev_ops = &ipw_netdev_ops; - priv->wireless_data.spy_data = &priv->ieee->spy_data; + priv->ieee->spy_enabled = true; net_dev->wireless_data = &priv->wireless_data; net_dev->wireless_handlers = &ipw_wx_handler_def; net_dev->ethtool_ops = &ipw_ethtool_ops; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw.h b/drivers/net/wireless/intel/ipw2x00/libipw.h index bc727c99ff3c..3c20353e5a41 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw.h +++ b/drivers/net/wireless/intel/ipw2x00/libipw.h @@ -788,6 +788,7 @@ struct libipw_device { int iw_mode; /* operating mode (IW_MODE_*) */ struct iw_spy_data spy_data; /* iwspy support */ + bool spy_enabled; spinlock_t lock; @@ -1083,4 +1084,16 @@ void libipw_crypto_tkip_exit(void); void libipw_crypto_ccmp_exit(void); void libipw_crypto_exit(void); + +int ipw_wx_set_spy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int ipw_wx_get_spy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int ipw_wx_set_thrspy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int ipw_wx_get_thrspy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +void libipw_spy_update(struct net_device *dev, unsigned char *address, + struct iw_quality *wstats); + #endif /* LIBIPW_H */ diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c index 1fe05e73a17c..7e41cb7bbfe0 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c @@ -393,7 +393,7 @@ int libipw_rx(struct libipw_device *ieee, struct sk_buff *skb, wstats.updated |= IW_QUAL_QUAL_INVALID; /* Update spy records */ - wireless_spy_update(ieee->dev, hdr->addr2, &wstats); + libipw_spy_update(ieee->dev, hdr->addr2, &wstats); } #endif /* IW_WIRELESS_SPY */ #endif /* CONFIG_WIRELESS_EXT */ diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c new file mode 100644 index 000000000000..979aeb10aeeb --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c @@ -0,0 +1,232 @@ +/* + * This file implement the Wireless Extensions spy API. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. + * + * (As all part of the Linux kernel, this file is GPL) + */ + +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +static struct iw_spy_data *get_spydata(struct net_device *dev) +{ + if (dev->wireless_data && dev->wireless_data->libipw && + dev->wireless_data->libipw->spy_enabled) + return &dev->wireless_data->libipw->spy_data; + return NULL; +} + +int ipw_wx_set_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Disable spy collection while we copy the addresses. + * While we copy addresses, any call to libipw_spy_update() + * will NOP. This is OK, as anyway the addresses are changing. */ + spydata->spy_number = 0; + + /* We want to operate without locking, because libipw_spy_update() + * most likely will happen in the interrupt handler, and therefore + * have its own locking constraints and needs performance. + * The rtnl_lock() make sure we don't race with the other iw_handlers. + * This make sure libipw_spy_update() "see" that the spy list + * is temporarily disabled. */ + smp_wmb(); + + /* Are there are addresses to copy? */ + if (wrqu->data.length > 0) { + int i; + + /* Copy addresses */ + for (i = 0; i < wrqu->data.length; i++) + memcpy(spydata->spy_address[i], address[i].sa_data, + ETH_ALEN); + /* Reset stats */ + memset(spydata->spy_stat, 0, + sizeof(struct iw_quality) * IW_MAX_SPY); + } + + /* Make sure above is updated before re-enabling */ + smp_wmb(); + + /* Enable addresses */ + spydata->spy_number = wrqu->data.length; + + return 0; +} +EXPORT_SYMBOL(ipw_wx_set_spy); + +int ipw_wx_get_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + int i; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + wrqu->data.length = spydata->spy_number; + + /* Copy addresses. */ + for (i = 0; i < spydata->spy_number; i++) { + memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); + address[i].sa_family = AF_UNIX; + } + /* Copy stats to the user buffer (just after). */ + if (spydata->spy_number > 0) + memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), + spydata->spy_stat, + sizeof(struct iw_quality) * spydata->spy_number); + /* Reset updated flags. */ + for (i = 0; i < spydata->spy_number; i++) + spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED; + return 0; +} +EXPORT_SYMBOL(ipw_wx_get_spy); + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set spy threshold + */ +int ipw_wx_set_thrspy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + spydata->spy_thr_low = threshold->low; + spydata->spy_thr_high = threshold->high; + + /* Clear flag */ + memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); + + return 0; +} +EXPORT_SYMBOL(ipw_wx_set_thrspy); + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get spy threshold + */ +int ipw_wx_get_thrspy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + threshold->low = spydata->spy_thr_low; + threshold->high = spydata->spy_thr_high; + + return 0; +} +EXPORT_SYMBOL(ipw_wx_get_thrspy); + +/*------------------------------------------------------------------*/ +/* + * Prepare and send a Spy Threshold event + */ +static void iw_send_thrspy_event(struct net_device * dev, + struct iw_spy_data * spydata, + unsigned char * address, + struct iw_quality * wstats) +{ + union iwreq_data wrqu; + struct iw_thrspy threshold; + + /* Init */ + wrqu.data.length = 1; + wrqu.data.flags = 0; + /* Copy address */ + memcpy(threshold.addr.sa_data, address, ETH_ALEN); + threshold.addr.sa_family = ARPHRD_ETHER; + /* Copy stats */ + threshold.qual = *wstats; + /* Copy also thresholds */ + threshold.low = spydata->spy_thr_low; + threshold.high = spydata->spy_thr_high; + + /* Send event to user space */ + wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); +} + +/* ---------------------------------------------------------------- */ +/* + * Call for the driver to update the spy data. + * For now, the spy data is a simple array. As the size of the array is + * small, this is good enough. If we wanted to support larger number of + * spy addresses, we should use something more efficient... + */ +void libipw_spy_update(struct net_device * dev, + unsigned char * address, + struct iw_quality * wstats) +{ + struct iw_spy_data * spydata = get_spydata(dev); + int i; + int match = -1; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return; + + /* Update all records that match */ + for (i = 0; i < spydata->spy_number; i++) + if (ether_addr_equal(address, spydata->spy_address[i])) { + memcpy(&(spydata->spy_stat[i]), wstats, + sizeof(struct iw_quality)); + match = i; + } + + /* Generate an event if we cross the spy threshold. + * To avoid event storms, we have a simple hysteresis : we generate + * event only when we go under the low threshold or above the + * high threshold. */ + if (match >= 0) { + if (spydata->spy_thr_under[match]) { + if (wstats->level > spydata->spy_thr_high.level) { + spydata->spy_thr_under[match] = 0; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } else { + if (wstats->level < spydata->spy_thr_low.level) { + spydata->spy_thr_under[match] = 1; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } + } +} diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 7af1082ea9a0..a7b502958d27 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -418,8 +418,6 @@ struct iw_spy_data { struct libipw_device; /* The struct */ struct iw_public_data { - /* Driver enhanced spy support */ - struct iw_spy_data * spy_data; /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ struct libipw_device * libipw; }; @@ -443,22 +441,6 @@ static inline void wireless_nlevent_flush(void) {} /* We may need a function to send a stream of events to user space. * More on that later... */ -/* Standard handler for SIOCSIWSPY */ -int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWSPY */ -int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCSIWTHRSPY */ -int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWTHRSPY */ -int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Driver call to update spy records */ -void wireless_spy_update(struct net_device *dev, unsigned char *address, - struct iw_quality *wstats); - /************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 733c53ad4de5..8c8bd8b75708 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -11,9 +11,6 @@ config WEXT_PROC depends on PROC_FS depends on WEXT_CORE -config WEXT_SPY - bool - config WEXT_PRIV bool diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 27f211bd9954..62a83faf0e07 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -4,7 +4,6 @@ obj-y += tests/ obj-$(CONFIG_WEXT_CORE) += wext-core.o obj-$(CONFIG_WEXT_PROC) += wext-proc.o -obj-$(CONFIG_WEXT_SPY) += wext-spy.o obj-$(CONFIG_WEXT_PRIV) += wext-priv.o cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c deleted file mode 100644 index b379a0371653..000000000000 --- a/net/wireless/wext-spy.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * This file implement the Wireless Extensions spy API. - * - * Authors : Jean Tourrilhes - HPL - - * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. - * - * (As all part of the Linux kernel, this file is GPL) - */ - -#include -#include -#include -#include -#include -#include -#include - -static inline struct iw_spy_data *get_spydata(struct net_device *dev) -{ - /* This is the new way */ - if (dev->wireless_data) - return dev->wireless_data->spy_data; - return NULL; -} - -int iw_handler_set_spy(struct net_device * dev, - struct iw_request_info * info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct sockaddr * address = (struct sockaddr *) extra; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - /* Disable spy collection while we copy the addresses. - * While we copy addresses, any call to wireless_spy_update() - * will NOP. This is OK, as anyway the addresses are changing. */ - spydata->spy_number = 0; - - /* We want to operate without locking, because wireless_spy_update() - * most likely will happen in the interrupt handler, and therefore - * have its own locking constraints and needs performance. - * The rtnl_lock() make sure we don't race with the other iw_handlers. - * This make sure wireless_spy_update() "see" that the spy list - * is temporarily disabled. */ - smp_wmb(); - - /* Are there are addresses to copy? */ - if (wrqu->data.length > 0) { - int i; - - /* Copy addresses */ - for (i = 0; i < wrqu->data.length; i++) - memcpy(spydata->spy_address[i], address[i].sa_data, - ETH_ALEN); - /* Reset stats */ - memset(spydata->spy_stat, 0, - sizeof(struct iw_quality) * IW_MAX_SPY); - } - - /* Make sure above is updated before re-enabling */ - smp_wmb(); - - /* Enable addresses */ - spydata->spy_number = wrqu->data.length; - - return 0; -} -EXPORT_SYMBOL(iw_handler_set_spy); - -int iw_handler_get_spy(struct net_device * dev, - struct iw_request_info * info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct sockaddr * address = (struct sockaddr *) extra; - int i; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - wrqu->data.length = spydata->spy_number; - - /* Copy addresses. */ - for (i = 0; i < spydata->spy_number; i++) { - memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); - address[i].sa_family = AF_UNIX; - } - /* Copy stats to the user buffer (just after). */ - if (spydata->spy_number > 0) - memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), - spydata->spy_stat, - sizeof(struct iw_quality) * spydata->spy_number); - /* Reset updated flags. */ - for (i = 0; i < spydata->spy_number; i++) - spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED; - return 0; -} -EXPORT_SYMBOL(iw_handler_get_spy); - -/*------------------------------------------------------------------*/ -/* - * Standard Wireless Handler : set spy threshold - */ -int iw_handler_set_thrspy(struct net_device * dev, - struct iw_request_info *info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct iw_thrspy * threshold = (struct iw_thrspy *) extra; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - /* Just do it */ - spydata->spy_thr_low = threshold->low; - spydata->spy_thr_high = threshold->high; - - /* Clear flag */ - memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); - - return 0; -} -EXPORT_SYMBOL(iw_handler_set_thrspy); - -/*------------------------------------------------------------------*/ -/* - * Standard Wireless Handler : get spy threshold - */ -int iw_handler_get_thrspy(struct net_device * dev, - struct iw_request_info *info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct iw_thrspy * threshold = (struct iw_thrspy *) extra; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - /* Just do it */ - threshold->low = spydata->spy_thr_low; - threshold->high = spydata->spy_thr_high; - - return 0; -} -EXPORT_SYMBOL(iw_handler_get_thrspy); - -/*------------------------------------------------------------------*/ -/* - * Prepare and send a Spy Threshold event - */ -static void iw_send_thrspy_event(struct net_device * dev, - struct iw_spy_data * spydata, - unsigned char * address, - struct iw_quality * wstats) -{ - union iwreq_data wrqu; - struct iw_thrspy threshold; - - /* Init */ - wrqu.data.length = 1; - wrqu.data.flags = 0; - /* Copy address */ - memcpy(threshold.addr.sa_data, address, ETH_ALEN); - threshold.addr.sa_family = ARPHRD_ETHER; - /* Copy stats */ - threshold.qual = *wstats; - /* Copy also thresholds */ - threshold.low = spydata->spy_thr_low; - threshold.high = spydata->spy_thr_high; - - /* Send event to user space */ - wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); -} - -/* ---------------------------------------------------------------- */ -/* - * Call for the driver to update the spy data. - * For now, the spy data is a simple array. As the size of the array is - * small, this is good enough. If we wanted to support larger number of - * spy addresses, we should use something more efficient... - */ -void wireless_spy_update(struct net_device * dev, - unsigned char * address, - struct iw_quality * wstats) -{ - struct iw_spy_data * spydata = get_spydata(dev); - int i; - int match = -1; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return; - - /* Update all records that match */ - for (i = 0; i < spydata->spy_number; i++) - if (ether_addr_equal(address, spydata->spy_address[i])) { - memcpy(&(spydata->spy_stat[i]), wstats, - sizeof(struct iw_quality)); - match = i; - } - - /* Generate an event if we cross the spy threshold. - * To avoid event storms, we have a simple hysteresis : we generate - * event only when we go under the low threshold or above the - * high threshold. */ - if (match >= 0) { - if (spydata->spy_thr_under[match]) { - if (wstats->level > spydata->spy_thr_high.level) { - spydata->spy_thr_under[match] = 0; - iw_send_thrspy_event(dev, spydata, - address, wstats); - } - } else { - if (wstats->level < spydata->spy_thr_low.level) { - spydata->spy_thr_under[match] = 1; - iw_send_thrspy_event(dev, spydata, - address, wstats); - } - } - } -} -EXPORT_SYMBOL(wireless_spy_update); -- cgit v1.2.3 From 836265d31631e28000fc8917ce697fc687a58724 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:35:25 +0200 Subject: wifi: remove iw_public_data from struct net_device Given the previous patches, we no longer need the struct iw_public_data etc., it's only used by the old Intel drivers (and ps3_gelic creates it but then doesn't use it). Remove all of that, including the pointer in struct net_device. Link: https://patch.msgid.link/20241007213525.8b2d52b60531.I6a27aaf30bded9a0977f07f47fba2bd31a3b3330@changeid Signed-off-by: Johannes Berg --- Documentation/networking/net_cachelines/net_device.rst | 1 - drivers/net/ethernet/toshiba/ps3_gelic_wireless.c | 1 - drivers/net/ethernet/toshiba/ps3_gelic_wireless.h | 1 - drivers/net/wireless/intel/ipw2x00/ipw2100.c | 2 -- drivers/net/wireless/intel/ipw2x00/ipw2100.h | 2 -- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 1 - drivers/net/wireless/intel/ipw2x00/ipw2200.h | 2 -- drivers/net/wireless/intel/ipw2x00/libipw_spy.c | 7 ++++--- include/linux/netdevice.h | 2 -- include/net/iw_handler.h | 18 ------------------ 10 files changed, 4 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a..b7d1d4c7a1ab 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -51,7 +51,6 @@ struct_net_device_core_stats* core_stats atomic_t carrier_up_count atomic_t carrier_down_count struct_iw_handler_def* wireless_handlers -struct_iw_public_data* wireless_data struct_ethtool_ops* ethtool_ops struct_l3mdev_ops* l3mdev_ops struct_ndisc_ops* ndisc_ops diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c index 44488c153ea2..4fbe4b7cd12a 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c @@ -2566,7 +2566,6 @@ static void gelic_wl_setup_netdev_ops(struct net_device *netdev) netdev->ethtool_ops = &gelic_wl_ethtool_ops; netdev->netdev_ops = &gelic_wl_netdevice_ops; - netdev->wireless_data = &wl->wireless_data; netdev->wireless_handlers = &gelic_wl_wext_handler_def; } diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h index 1f203d1ae8db..dbabf538e10a 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h +++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.h @@ -276,7 +276,6 @@ struct gelic_wl_info { u8 active_bssid[ETH_ALEN]; /* associated bssid */ unsigned int essid_len; - struct iw_public_data wireless_data; struct iw_statistics iwstat; }; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index a89e06c1b8ee..15bf35f2de2a 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -6022,8 +6022,6 @@ static struct net_device *ipw2100_alloc_device(struct pci_dev *pci_dev, dev->netdev_ops = &ipw2100_netdev_ops; dev->ethtool_ops = &ipw2100_ethtool_ops; dev->wireless_handlers = &ipw2100_wx_handler_def; - priv->wireless_data.libipw = priv->ieee; - dev->wireless_data = &priv->wireless_data; dev->watchdog_timeo = 3 * HZ; dev->irq = 0; dev->min_mtu = 68; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.h b/drivers/net/wireless/intel/ipw2x00/ipw2100.h index b34085ade3aa..6f81f509b9cb 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.h @@ -554,8 +554,6 @@ struct ipw2100_priv { struct net_device *net_dev; struct iw_statistics wstats; - struct iw_public_data wireless_data; - struct tasklet_struct irq_tasklet; struct delayed_work reset_work; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index c0e9d2109e34..be1d971b3d32 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -11645,7 +11645,6 @@ static int ipw_pci_probe(struct pci_dev *pdev, net_dev->netdev_ops = &ipw_netdev_ops; priv->ieee->spy_enabled = true; - net_dev->wireless_data = &priv->wireless_data; net_dev->wireless_handlers = &ipw_wx_handler_def; net_dev->ethtool_ops = &ipw_ethtool_ops; diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.h b/drivers/net/wireless/intel/ipw2x00/ipw2200.h index 46f119123b49..d3db54e6d37c 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.h @@ -1274,8 +1274,6 @@ struct ipw_priv { struct iw_statistics wstats; - struct iw_public_data wireless_data; - int user_requested_scan; u8 direct_scan_ssid[IW_ESSID_MAX_SIZE]; u8 direct_scan_ssid_len; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c index 979aeb10aeeb..ba876e92f7f6 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c @@ -18,9 +18,10 @@ static struct iw_spy_data *get_spydata(struct net_device *dev) { - if (dev->wireless_data && dev->wireless_data->libipw && - dev->wireless_data->libipw->spy_enabled) - return &dev->wireless_data->libipw->spy_data; + struct libipw_device *ieee = netdev_priv(dev); + + if (ieee->spy_enabled) + return &ieee->spy_data; return NULL; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87b5e488325..f0abed1e6e45 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1773,7 +1773,6 @@ enum netdev_reg_state { * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see for details. - * @wireless_data: Instance data managed by the core of wireless extensions * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions @@ -2150,7 +2149,6 @@ struct net_device { #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; - struct iw_public_data *wireless_data; #endif const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index a7b502958d27..fc44fcca1d5c 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -404,24 +404,6 @@ struct iw_spy_data { u_char spy_thr_under[IW_MAX_SPY]; }; -/* --------------------- DEVICE WIRELESS DATA --------------------- */ -/* - * This is all the wireless data specific to a device instance that - * is managed by the core of Wireless Extensions or the 802.11 layer. - * We only keep pointer to those structures, so that a driver is free - * to share them between instances. - * This structure should be initialised before registering the device. - * Access to this data follow the same rules as any other struct net_device - * data (i.e. valid as long as struct net_device exist, same locking rules). - */ -/* Forward declaration */ -struct libipw_device; -/* The struct */ -struct iw_public_data { - /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ - struct libipw_device * libipw; -}; - /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). -- cgit v1.2.3 From aee809aaa2d13bf560fe38d28c4969605e6d9d0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:47:16 +0200 Subject: wifi: cfg80211: unexport wireless_nlevent_flush() This no longer needs to be exported, so don't export it. Link: https://patch.msgid.link/20241007214715.3dd736dc3ac0.I1388536e99c37f28a007dd753c473ad21513d9a9@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ------ net/wireless/wext-compat.h | 6 ++++++ net/wireless/wext-core.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index fc44fcca1d5c..804587b7592b 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -413,12 +413,6 @@ struct iw_spy_data { /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); -#ifdef CONFIG_WEXT_CORE -/* flush all previous wext events - if work is done from netdev notifiers */ -void wireless_nlevent_flush(void); -#else -static inline void wireless_nlevent_flush(void) {} -#endif /* We may need a function to send a stream of events to user space. * More on that later... */ diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index 8251ca5df8ae..f680dd134582 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -5,6 +5,12 @@ #include #include +#ifdef CONFIG_WEXT_CORE +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif + int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra); diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 3bb04b05c5ce..00c640b3e86e 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -20,6 +20,7 @@ #include #include #include +#include "wext-compat.h" typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, unsigned int, struct iw_request_info *, @@ -356,7 +357,6 @@ void wireless_nlevent_flush(void) } up_read(&net_rwsem); } -EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) -- cgit v1.2.3 From ff919efb5fe8256fba132b5f2c58df3c38bdd0b3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 22:00:03 +0200 Subject: wireless: wext: shorten struct iw_ioctl_description There's no need for "future" extensions in an internal struct, and we don't need a u32 for flags, use just a u8. Also remove the unused IW_DESCR_FLAG_WAIT flag. Link: https://patch.msgid.link/20241007220003.309bd52fa763.I9a1229fa7f2be53d4f50e63671ed441d0968bb41@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 804587b7592b..c9b46b996197 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -279,8 +279,6 @@ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ -/* Driver level flags */ -#define IW_DESCR_FLAG_WAIT 0x0100 /* Wait for driver event */ /****************************** TYPES ******************************/ @@ -373,11 +371,10 @@ struct iw_handler_def { */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ - __u8 token_type; /* Future */ + __u8 flags; /* Special handling of the request */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ - __u32 flags; /* Special handling of the request */ }; /* Need to think of short header translation table. Later. */ -- cgit v1.2.3 From da5e06dee58ad153a4933fd40fc53d571bfef373 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sun, 6 Oct 2024 07:26:09 +0900 Subject: net-timestamp: namespacify the sysctl_tstamp_allow_data Let it be tuned in per netns by admins. Signed-off-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241005222609.94980-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/core.h | 1 + include/net/sock.h | 2 -- net/core/net_namespace.c | 1 + net/core/skbuff.c | 2 +- net/core/sock.c | 2 -- net/core/sysctl_net_core.c | 18 +++++++++--------- 6 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 78214f1b43a2..9b36f0ff0c20 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -15,6 +15,7 @@ struct netns_core { int sysctl_somaxconn; int sysctl_optmem_max; u8 sysctl_txrehash; + u8 sysctl_tstamp_allow_data; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index e282127092ab..b32f1424ecc5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2824,8 +2824,6 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern int sysctl_tstamp_allow_data; - extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 105e3cd26763..a5bc1fd8b034 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -317,6 +317,7 @@ static __net_init void preinit_net_sysctl(struct net *net) */ net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + net->core.sysctl_tstamp_allow_data = 1; } /* init code that must occur even if setup_net() is not called. */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74149dc4ee31..00afeb90c23a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5506,7 +5506,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) + if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/core/sock.c b/net/core/sock.c index 846f494a17cf..083d438d8b6f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -286,8 +286,6 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_tstamp_allow_data __read_mostly = 1; - DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 86a2476678c4..b60fac380cec 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -491,15 +491,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -665,6 +656,15 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ -- cgit v1.2.3 From 3fe3dbaf26723c473d42a58b636a2500586b821e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 7 Oct 2024 01:44:56 +0100 Subject: caif: Remove unused cfsrvl_getphyid cfsrvl_getphyid() has been unused since 2011's commit f36214408470 ("caif: Use RCU and lists in cfcnfg.c for managing caif link layers") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241007004456.149899-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/caif/cfsrvl.h | 1 - net/caif/cfsrvl.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'include/net') diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h index 5ee7b322e18b..a000dc45f966 100644 --- a/include/net/caif/cfsrvl.h +++ b/include/net/caif/cfsrvl.h @@ -40,7 +40,6 @@ void cfsrvl_init(struct cfsrvl *service, struct dev_info *dev_info, bool supports_flowctrl); bool cfsrvl_ready(struct cfsrvl *service, int *err); -u8 cfsrvl_getphyid(struct cflayer *layer); static inline void cfsrvl_get(struct cflayer *layr) { diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index 9cef9496a707..171fa32ada85 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -183,12 +183,6 @@ bool cfsrvl_ready(struct cfsrvl *service, int *err) return true; } -u8 cfsrvl_getphyid(struct cflayer *layer) -{ - struct cfsrvl *servl = container_obj(layer); - return servl->dev_info.id; -} - bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); -- cgit v1.2.3 From db03488897a70367aeafe82d07a78943d2a6068e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 9 Oct 2024 08:33:05 +0200 Subject: Revert "wifi: cfg80211: unexport wireless_nlevent_flush()" Revert this, I neglected to take into account the fact that cfg80211 itself can be a module, but wext is always builtin. Fixes: aee809aaa2d1 ("wifi: cfg80211: unexport wireless_nlevent_flush()") Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ++++++ net/wireless/wext-compat.h | 6 ------ net/wireless/wext-core.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index c9b46b996197..b80e474cb0aa 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -410,6 +410,12 @@ struct iw_spy_data { /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); +#ifdef CONFIG_WEXT_CORE +/* flush all previous wext events - if work is done from netdev notifiers */ +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif /* We may need a function to send a stream of events to user space. * More on that later... */ diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index f680dd134582..8251ca5df8ae 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -5,12 +5,6 @@ #include #include -#ifdef CONFIG_WEXT_CORE -void wireless_nlevent_flush(void); -#else -static inline void wireless_nlevent_flush(void) {} -#endif - int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra); diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 00c640b3e86e..3bb04b05c5ce 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -20,7 +20,6 @@ #include #include #include -#include "wext-compat.h" typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, unsigned int, struct iw_request_info *, @@ -357,6 +356,7 @@ void wireless_nlevent_flush(void) } up_read(&net_rwsem); } +EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) -- cgit v1.2.3 From 6607c17c6c5e029da03a90085db22daf518232bf Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Tue, 8 Oct 2024 00:06:15 -0700 Subject: net: mana: Enable debugfs files for MANA device Implement debugfs in MANA driver to be able to view RX,TX,EQ queue specific attributes and dump their gdma queues. These dumps can be used by other userspace utilities to improve debuggability and troubleshooting Following files are added in debugfs: /sys/kernel/debug/mana/ |-------------- 1 |--------------- EQs | |------- eq0 | | |---head | | |---tail | | |---eq_dump | |------- eq1 | . | . | |--------------- adapter-MTU |--------------- vport0 |------- RX-0 | |---cq_budget | |---cq_dump | |---cq_head | |---cq_tail | |---rq_head | |---rq_nbuf | |---rq_tail | |---rxq_dump |------- RX-1 . . |------- TX-0 | |---cq_budget | |---cq_dump | |---cq_head | |---cq_tail | |---sq_head | |---sq_pend_skb_qlen | |---sq_tail | |---txq_dump |------- TX-1 . . Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 43 +++++++++- drivers/net/ethernet/microsoft/mana/mana_en.c | 105 +++++++++++++++++++++++- include/net/mana/gdma.h | 6 +- include/net/mana/mana.h | 8 ++ 4 files changed, 159 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index ca4ed58f1206..e97af7ac2bb2 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Copyright (c) 2021, Microsoft Corporation. */ +#include #include #include #include @@ -8,6 +9,8 @@ #include +struct dentry *mana_debugfs_root; + static u32 mana_gd_r32(struct gdma_context *g, u64 offset) { return readl(g->bar0_va + offset); @@ -1516,6 +1519,12 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gc->bar0_va = bar0_va; gc->dev = &pdev->dev; + if (gc->is_pf) + gc->mana_pci_debugfs = debugfs_create_dir("0", mana_debugfs_root); + else + gc->mana_pci_debugfs = debugfs_create_dir(pci_slot_name(pdev->slot), + mana_debugfs_root); + err = mana_gd_setup(pdev); if (err) goto unmap_bar; @@ -1529,6 +1538,13 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) cleanup_gd: mana_gd_cleanup(pdev); unmap_bar: + /* + * at this point we know that the other debugfs child dir/files + * are either not yet created or are already cleaned up. + * The pci debugfs folder clean-up now, will only be cleaning up + * adapter-MTU file and apc->mana_pci_debugfs folder. + */ + debugfs_remove_recursive(gc->mana_pci_debugfs); pci_iounmap(pdev, bar0_va); free_gc: pci_set_drvdata(pdev, NULL); @@ -1549,6 +1565,8 @@ static void mana_gd_remove(struct pci_dev *pdev) mana_gd_cleanup(pdev); + debugfs_remove_recursive(gc->mana_pci_debugfs); + pci_iounmap(pdev, gc->bar0_va); vfree(gc); @@ -1600,6 +1618,8 @@ static void mana_gd_shutdown(struct pci_dev *pdev) mana_gd_cleanup(pdev); + debugfs_remove_recursive(gc->mana_pci_debugfs); + pci_disable_device(pdev); } @@ -1619,7 +1639,28 @@ static struct pci_driver mana_driver = { .shutdown = mana_gd_shutdown, }; -module_pci_driver(mana_driver); +static int __init mana_driver_init(void) +{ + int err; + + mana_debugfs_root = debugfs_create_dir("mana", NULL); + + err = pci_register_driver(&mana_driver); + if (err) + debugfs_remove(mana_debugfs_root); + + return err; +} + +static void __exit mana_driver_exit(void) +{ + debugfs_remove(mana_debugfs_root); + + pci_unregister_driver(&mana_driver); +} + +module_init(mana_driver_init); +module_exit(mana_driver_exit); MODULE_DEVICE_TABLE(pci, mana_id_table); diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index c47266d1c7c2..57ac732e7707 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -3,6 +3,7 @@ #include +#include #include #include #include @@ -30,6 +31,21 @@ static void mana_adev_idx_free(int idx) ida_free(&mana_adev_ida, idx); } +static ssize_t mana_dbg_q_read(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct gdma_queue *gdma_q = filp->private_data; + + return simple_read_from_buffer(buf, count, pos, gdma_q->queue_mem_ptr, + gdma_q->queue_size); +} + +static const struct file_operations mana_dbg_q_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mana_dbg_q_read, +}; + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -721,6 +737,13 @@ static const struct net_device_ops mana_devops = { static void mana_cleanup_port_context(struct mana_port_context *apc) { + /* + * at this point all dir/files under the vport directory + * are already cleaned up. + * We are sure the apc->mana_port_debugfs remove will not + * cause any freed memory access issues + */ + debugfs_remove(apc->mana_port_debugfs); kfree(apc->rxqs); apc->rxqs = NULL; } @@ -943,6 +966,8 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver, else gc->adapter_mtu = ETH_FRAME_LEN; + debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu); + return 0; } @@ -1228,6 +1253,8 @@ static void mana_destroy_eq(struct mana_context *ac) if (!ac->eqs) return; + debugfs_remove_recursive(ac->mana_eqs_debugfs); + for (i = 0; i < gc->max_num_queues; i++) { eq = ac->eqs[i].eq; if (!eq) @@ -1240,6 +1267,18 @@ static void mana_destroy_eq(struct mana_context *ac) ac->eqs = NULL; } +static void mana_create_eq_debugfs(struct mana_context *ac, int i) +{ + struct mana_eq eq = ac->eqs[i]; + char eqnum[32]; + + sprintf(eqnum, "eq%d", i); + eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs); + debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head); + debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail); + debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops); +} + static int mana_create_eq(struct mana_context *ac) { struct gdma_dev *gd = ac->gdma_dev; @@ -1260,11 +1299,14 @@ static int mana_create_eq(struct mana_context *ac) spec.eq.context = ac->eqs; spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs); + for (i = 0; i < gc->max_num_queues; i++) { spec.eq.msix_index = (i + 1) % gc->num_msix_usable; err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq); if (err) goto out; + mana_create_eq_debugfs(ac, i); } return 0; @@ -1871,6 +1913,8 @@ static void mana_destroy_txq(struct mana_port_context *apc) return; for (i = 0; i < apc->num_queues; i++) { + debugfs_remove_recursive(apc->tx_qp[i].mana_tx_debugfs); + napi = &apc->tx_qp[i].tx_cq.napi; if (apc->tx_qp[i].txq.napi_initialized) { napi_synchronize(napi); @@ -1889,6 +1933,31 @@ static void mana_destroy_txq(struct mana_port_context *apc) apc->tx_qp = NULL; } +static void mana_create_txq_debugfs(struct mana_port_context *apc, int idx) +{ + struct mana_tx_qp *tx_qp = &apc->tx_qp[idx]; + char qnum[32]; + + sprintf(qnum, "TX-%d", idx); + tx_qp->mana_tx_debugfs = debugfs_create_dir(qnum, apc->mana_port_debugfs); + debugfs_create_u32("sq_head", 0400, tx_qp->mana_tx_debugfs, + &tx_qp->txq.gdma_sq->head); + debugfs_create_u32("sq_tail", 0400, tx_qp->mana_tx_debugfs, + &tx_qp->txq.gdma_sq->tail); + debugfs_create_u32("sq_pend_skb_qlen", 0400, tx_qp->mana_tx_debugfs, + &tx_qp->txq.pending_skbs.qlen); + debugfs_create_u32("cq_head", 0400, tx_qp->mana_tx_debugfs, + &tx_qp->tx_cq.gdma_cq->head); + debugfs_create_u32("cq_tail", 0400, tx_qp->mana_tx_debugfs, + &tx_qp->tx_cq.gdma_cq->tail); + debugfs_create_u32("cq_budget", 0400, tx_qp->mana_tx_debugfs, + &tx_qp->tx_cq.budget); + debugfs_create_file("txq_dump", 0400, tx_qp->mana_tx_debugfs, + tx_qp->txq.gdma_sq, &mana_dbg_q_fops); + debugfs_create_file("cq_dump", 0400, tx_qp->mana_tx_debugfs, + tx_qp->tx_cq.gdma_cq, &mana_dbg_q_fops); +} + static int mana_create_txq(struct mana_port_context *apc, struct net_device *net) { @@ -2000,6 +2069,8 @@ static int mana_create_txq(struct mana_port_context *apc, gc->cq_table[cq->gdma_id] = cq->gdma_cq; + mana_create_txq_debugfs(apc, i); + netif_napi_add_tx(net, &cq->napi, mana_poll); napi_enable(&cq->napi); txq->napi_initialized = true; @@ -2027,6 +2098,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (!rxq) return; + debugfs_remove_recursive(rxq->mana_rx_debugfs); + napi = &rxq->rx_cq.napi; if (napi_initialized) { @@ -2308,6 +2381,28 @@ out: return NULL; } +static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx) +{ + struct mana_rxq *rxq; + char qnum[32]; + + rxq = apc->rxqs[idx]; + + sprintf(qnum, "RX-%d", idx); + rxq->mana_rx_debugfs = debugfs_create_dir(qnum, apc->mana_port_debugfs); + debugfs_create_u32("rq_head", 0400, rxq->mana_rx_debugfs, &rxq->gdma_rq->head); + debugfs_create_u32("rq_tail", 0400, rxq->mana_rx_debugfs, &rxq->gdma_rq->tail); + debugfs_create_u32("rq_nbuf", 0400, rxq->mana_rx_debugfs, &rxq->num_rx_buf); + debugfs_create_u32("cq_head", 0400, rxq->mana_rx_debugfs, + &rxq->rx_cq.gdma_cq->head); + debugfs_create_u32("cq_tail", 0400, rxq->mana_rx_debugfs, + &rxq->rx_cq.gdma_cq->tail); + debugfs_create_u32("cq_budget", 0400, rxq->mana_rx_debugfs, &rxq->rx_cq.budget); + debugfs_create_file("rxq_dump", 0400, rxq->mana_rx_debugfs, rxq->gdma_rq, &mana_dbg_q_fops); + debugfs_create_file("cq_dump", 0400, rxq->mana_rx_debugfs, rxq->rx_cq.gdma_cq, + &mana_dbg_q_fops); +} + static int mana_add_rx_queues(struct mana_port_context *apc, struct net_device *ndev) { @@ -2326,6 +2421,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc, u64_stats_init(&rxq->stats.syncp); apc->rxqs[i] = rxq; + + mana_create_rxq_debugfs(apc, i); } apc->default_rxobj = apc->rxqs[0]->rxobj; @@ -2518,14 +2615,19 @@ void mana_query_gf_stats(struct mana_port_context *apc) static int mana_init_port(struct net_device *ndev) { struct mana_port_context *apc = netdev_priv(ndev); + struct gdma_dev *gd = apc->ac->gdma_dev; u32 max_txq, max_rxq, max_queues; int port_idx = apc->port_idx; + struct gdma_context *gc; + char vport[32]; int err; err = mana_init_port_context(apc); if (err) return err; + gc = gd->gdma_context; + err = mana_query_vport_cfg(apc, port_idx, &max_txq, &max_rxq, &apc->indir_table_sz); if (err) { @@ -2542,7 +2644,8 @@ static int mana_init_port(struct net_device *ndev) apc->num_queues = apc->max_queues; eth_hw_addr_set(ndev, apc->mac_addr); - + sprintf(vport, "vport%d", port_idx); + apc->mana_port_debugfs = debugfs_create_dir(vport, gc->mana_pci_debugfs); return 0; reset_apc: diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index de47fa533b15..90f56656b572 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -267,7 +267,8 @@ struct gdma_event { struct gdma_queue; struct mana_eq { - struct gdma_queue *eq; + struct gdma_queue *eq; + struct dentry *mana_eq_debugfs; }; typedef void gdma_eq_callback(void *context, struct gdma_queue *q, @@ -365,6 +366,7 @@ struct gdma_irq_context { struct gdma_context { struct device *dev; + struct dentry *mana_pci_debugfs; /* Per-vPort max number of queues */ unsigned int max_num_queues; @@ -878,5 +880,7 @@ int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); +void mana_register_debugfs(void); +void mana_unregister_debugfs(void); #endif /* _GDMA_H */ diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 9b0faa24b758..0d00b24eacaf 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -350,6 +350,7 @@ struct mana_rxq { int xdp_rc; /* XDP redirect return code */ struct page_pool *page_pool; + struct dentry *mana_rx_debugfs; /* MUST BE THE LAST MEMBER: * Each receive buffer has an associated mana_recv_buf_oob. @@ -363,6 +364,8 @@ struct mana_tx_qp { struct mana_cq tx_cq; mana_handle_t tx_object; + + struct dentry *mana_tx_debugfs; }; struct mana_ethtool_stats { @@ -407,6 +410,7 @@ struct mana_context { u16 num_ports; struct mana_eq *eqs; + struct dentry *mana_eqs_debugfs; struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; }; @@ -468,6 +472,9 @@ struct mana_port_context { bool port_st_save; /* Saved port state */ struct mana_ethtool_stats eth_stats; + + /* Debugfs */ + struct dentry *mana_port_debugfs; }; netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); @@ -494,6 +501,7 @@ int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); extern const struct ethtool_ops mana_ethtool_ops; +extern struct dentry *mana_debugfs_root; /* A CQ can be created not associated with any EQ */ #define GDMA_CQ_NO_EQ 0xffff -- cgit v1.2.3 From 2b78d30620d7f8a9f9ce312ad21200ec7a554bd9 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:29 +0200 Subject: ipv4: Convert ip_route_use_hint() to dscp_t. Pass a dscp_t variable to ip_route_use_hint(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_rcv_finish_core() actually calls ip_route_use_hint(). Use the ip4h_dscp() helper to get the DSCP from the IPv4 header. While there, modify the declaration of ip_route_use_hint() in include/net/route.h so that it matches the prototype of its implementation in net/ipv4/route.c. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c40994fdf804db7a363d04fdee01bf48dddda676.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 4 ++-- net/ipv4/ip_input.c | 4 ++-- net/ipv4/route.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 5e4374d66927..c219c0fecdcf 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c0a2490eb7c1..89bb63da6852 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -325,8 +325,8 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (ip_can_use_hint(skb, iph, hint)) { - err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, - dev, hint); + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev, hint); if (unlikely(err)) goto drop_error; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6e1cd0065b87..ac03916cfcde 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2136,7 +2136,7 @@ static int ip_mkroute_input(struct sk_buff *skb, * Uses the provided hint instead of performing a route lookup. */ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2160,8 +2160,8 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - tos &= INET_DSCP_MASK; - err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), + 0, dev, in_dev, &tag); if (err < 0) goto martian_source; -- cgit v1.2.3 From d32976408744a589f04b5c939f8f01f7167e5167 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:54 +0200 Subject: ipv4: Convert ip_mc_validate_source() to dscp_t. Pass a dscp_t variable to ip_mc_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_mc_validate_source() to consider are: * ip_route_input_mc() which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * udp_v4_early_demux() which gets the DSCP directly from the IPv4 header and can simply use the ip4h_dscp() helper. Also, stop including net/inet_dscp.h in udp.c as we don't use any of its declarations anymore. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c91b2cca04718b7ee6cf5b9c1d5b40507d65a8d4.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 3 ++- net/ipv4/route.c | 8 ++++---- net/ipv4/udp.c | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index c219c0fecdcf..586e59f7ed8a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -198,8 +198,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } + int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 527121be1ba2..1efb65e647c1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1666,7 +1666,7 @@ EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { int err; @@ -1687,7 +1687,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + err = fib_validate_source(skb, saddr, 0, + inet_dscp_to_dsfield(dscp), 0, dev, in_dev, itag); if (err < 0) return err; @@ -1705,8 +1706,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 itag = 0; int err; - err = ip_mc_validate_source(skb, daddr, saddr, - inet_dscp_to_dsfield(dscp), dev, in_dev, + err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (err) return err; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8accbf4cb295..4b74a25d0b6e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -115,7 +116,6 @@ #include #include #include -#include #if IS_ENABLED(CONFIG_IPV6) #include #endif @@ -2619,7 +2619,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, - iph->tos & INET_DSCP_MASK, + ip4h_dscp(iph), skb->dev, in_dev, &itag); } return 0; -- cgit v1.2.3 From d36236ab52754ef6bd083be945e9c2e93f466022 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:25:02 +0200 Subject: ipv4: Convert fib_validate_source() to dscp_t. Pass a dscp_t variable to fib_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. All callers of fib_validate_source() already have a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversions. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/08612a4519bc5a3578bb493fbaad82437ebb73dc.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 5 +++-- net/ipv4/route.c | 21 +++++++++------------ 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 967e4dc555fa..06130933542d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -449,8 +449,9 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 793e6781399a..d0fbc8c8c5e6 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -419,7 +419,7 @@ e_rpf: /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); @@ -448,7 +448,8 @@ ok: } full_check: - return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); + return __fib_validate_source(skb, src, dst, inet_dscp_to_dsfield(dscp), + oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1efb65e647c1..a0b091a7df87 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1687,9 +1687,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, - inet_dscp_to_dsfield(dscp), 0, dev, - in_dev, itag); + err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, + itag); if (err < 0) return err; } @@ -1786,8 +1785,8 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), - FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); + err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2159,8 +2158,8 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), - 0, dev, in_dev, &tag); + err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, + &tag); if (err < 0) goto martian_source; @@ -2298,8 +2297,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, } if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, - inet_dscp_to_dsfield(dscp), 0, dev, + err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, &itag); if (err < 0) goto martian_source; @@ -2322,9 +2320,8 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, - inet_dscp_to_dsfield(dscp), 0, dev, - in_dev, &itag); + err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, + &itag); if (err < 0) goto martian_source; } -- cgit v1.2.3 From 87173021f1583ee37f4801fcde354729da8db3dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:03 -0700 Subject: ipv4: Link IPv4 address to per-netns hash table. As a prep for per-netns RTNL conversion, we want to namespacify the IPv4 address hash table and the GC work. Let's allocate the per-netns IPv4 address hash table to net->ipv4.inet_addr_lst and link IPv4 addresses into it. The actual users will be converted later. Note that the IPv6 address hash table is already namespacified. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 22 +++++++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index cb5280e6cc21..d0c2bf67a9b0 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -142,6 +142,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) struct in_ifaddr { struct hlist_node hash; + struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 276f622f3516..29eba2eaaa26 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -270,5 +270,6 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; + struct hlist_head *inet_addr_lst; }; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ab76744383cf..059807a627a6 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -134,11 +134,13 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) ASSERT_RTNL(); hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); + hlist_del_init_rcu(&ifa->addr_lst); hlist_del_init_rcu(&ifa->hash); } @@ -228,6 +230,7 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->hash); + INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } @@ -2663,14 +2666,21 @@ static struct ctl_table ctl_forward_entry[] = { static __net_init int devinet_init_net(struct net *net) { - int err; - struct ipv4_devconf *all, *dflt; #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; struct ctl_table_header *forw_hdr; + struct ctl_table *tbl; #endif + struct ipv4_devconf *all, *dflt; + int err; + int i; err = -ENOMEM; + net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!net->ipv4.inet_addr_lst) + goto err_alloc_hash; + all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; @@ -2731,6 +2741,9 @@ static __net_init int devinet_init_net(struct net *net) net->ipv4.forw_hdr = forw_hdr; #endif + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); + net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; @@ -2748,6 +2761,8 @@ err_alloc_ctl: err_alloc_dflt: kfree(all); err_alloc_all: + kfree(net->ipv4.inet_addr_lst); +err_alloc_hash: return err; } @@ -2766,6 +2781,7 @@ static __net_exit void devinet_exit_net(struct net *net) #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); + kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { -- cgit v1.2.3 From 1675f385213edc14ed849e079d6866b48e552252 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:05 -0700 Subject: ipv4: Namespacify IPv4 address GC. Each IPv4 address could have a lifetime, which is useful for DHCP, and GC is periodically executed as check_lifetime_work. check_lifetime() does the actual GC under RTNL. 1. Acquire RTNL 2. Iterate inet_addr_lst 3. Remove IPv4 address if expired 4. Release RTNL Namespacifying the GC is required for per-netns RTNL, but using the per-netns hash table will shorten the time on the hash bucket iteration under RTNL. Let's add per-netns GC work and use the per-netns hash table. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 32 ++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 29eba2eaaa26..66a4cffc44ee 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -271,5 +271,6 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; + struct delayed_work addr_chk_work; }; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cf47b5ac061f..ac245944e89e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -486,15 +486,12 @@ static void inet_del_ifa(struct in_device *in_dev, __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } -static void check_lifetime(struct work_struct *work); - -static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); - static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; + struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; @@ -563,8 +560,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); - cancel_delayed_work(&check_lifetime_work); - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); + cancel_delayed_work(&net->ipv4.addr_chk_work); + queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -710,16 +707,19 @@ static void check_lifetime(struct work_struct *work) unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; + struct net *net; int i; + net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { + struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { + hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; @@ -757,7 +757,7 @@ static void check_lifetime(struct work_struct *work) if (!change_needed) continue; rtnl_lock(); - hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { + hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) @@ -806,8 +806,8 @@ static void check_lifetime(struct work_struct *work) if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, - next_sched - now); + queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, + next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, @@ -1004,9 +1004,9 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); - cancel_delayed_work(&check_lifetime_work); + cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, - &check_lifetime_work, 0); + &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } return 0; @@ -2743,6 +2743,8 @@ static __net_init int devinet_init_net(struct net *net) for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); + INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); + net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; @@ -2769,7 +2771,11 @@ static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; +#endif + + cancel_delayed_work_sync(&net->ipv4.addr_chk_work); +#ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, @@ -2806,8 +2812,6 @@ void __init devinet_init(void) register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); - rtnl_af_register(&inet_af_ops); rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); -- cgit v1.2.3 From 9e542ff8b79ae1871074d3a7dca7de9e7374eeda Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Oct 2024 09:32:04 -0700 Subject: net: Remove likely from l3mdev_master_ifindex_by_index The likely() annotation in l3mdev_master_ifindex_by_index() has been found to be incorrect 100% of the time in real-world workloads (e.g., web servers). Annotated branches shows the following in these servers: correct incorrect % Function File Line 0 169053813 100 l3mdev_master_ifindex_by_index l3mdev.h 81 This is happening because l3mdev_master_ifindex_by_index() is called from __inet_check_established(), which calls l3mdev_master_ifindex_by_index() passing the socked bounded interface. l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); Since most sockets are not going to be bound to a network device, the likely() is giving the wrong assumption. Remove the likely() annotation to ensure more accurate branch prediction. Signed-off-by: Breno Leitao Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241008163205.3939629-1-leitao@debian.org Signed-off-by: Paolo Abeni --- include/net/l3mdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 031c661aa14d..2d6141f28b53 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -78,7 +78,7 @@ static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) struct net_device *dev; int rc = 0; - if (likely(ifindex)) { + if (ifindex) { rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); -- cgit v1.2.3 From 13d68a16430312fc21990f48326366eb73891202 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:47 +0200 Subject: genetlink: extend info user-storage to match NL cb ctx This allows a more uniform implementation of non-dump and dump operations, and will be used later in the series to avoid some per-operation allocation. Additionally rename the NL_ASSERT_DUMP_CTX_FITS macro, to fit a more extended usage. Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/1130cc2896626b84587a2a5f96a5c6829638f4da.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_mdb.c | 2 +- include/linux/netlink.h | 5 +++-- include/net/genetlink.h | 8 ++++++-- net/core/netdev-genl.c | 2 +- net/core/rtnetlink.c | 2 +- net/devlink/devl_internal.h | 2 +- net/ethtool/rss.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netlink/genetlink.c | 4 ++-- 9 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c index 60eb95a06d55..ebed05a2804c 100644 --- a/drivers/net/vxlan/vxlan_mdb.c +++ b/drivers/net/vxlan/vxlan_mdb.c @@ -284,7 +284,7 @@ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb, ASSERT_RTNL(); - NL_ASSERT_DUMP_CTX_FITS(struct vxlan_mdb_dump_ctx); + NL_ASSERT_CTX_FITS(struct vxlan_mdb_dump_ctx); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWMDB, sizeof(*bpm), diff --git a/include/linux/netlink.h b/include/linux/netlink.h index b332c2048c75..a3ca198a3a9e 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -34,6 +34,7 @@ struct netlink_skb_parms { #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) +#define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); @@ -293,7 +294,7 @@ struct netlink_callback { int flags; bool strict_check; union { - u8 ctx[48]; + u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. @@ -302,7 +303,7 @@ struct netlink_callback { }; }; -#define NL_ASSERT_DUMP_CTX_FITS(type_name) \ +#define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ab49bfeae78..9d3726e8f90e 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -124,7 +124,8 @@ struct genl_family { * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace - * @user_ptr: user pointers + * @ctx: storage space for the use by the family + * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { @@ -135,7 +136,10 @@ struct genl_info { struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; - void * user_ptr[2]; + union { + u8 ctx[NETLINK_CTX_SIZE]; + void * user_ptr[2]; + }; struct netlink_ext_ack *extack; }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1cb954f2d39e..358cba248796 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -24,7 +24,7 @@ struct netdev_nl_dump_ctx { static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6d68247aea70..a9b81b7d9746 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6243,7 +6243,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int idx, s_idx; int err; - NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx); + NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index c7a8e13f917c..a9f064ab9ed9 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -166,7 +166,7 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, static inline struct devlink_nl_dump_state * devlink_dump_state(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct devlink_nl_dump_state); + NL_ASSERT_CTX_FITS(struct devlink_nl_dump_state); return (struct devlink_nl_dump_state *)cb->ctx; } diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index e07386275e14..7cb106b590ab 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -224,7 +224,7 @@ struct rss_nl_dump_ctx { static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct rss_nl_dump_ctx); + NL_ASSERT_CTX_FITS(struct rss_nl_dump_ctx); return (struct rss_nl_dump_ctx *)cb->ctx; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6a1239433830..36168f8b6efa 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3870,7 +3870,7 @@ static int __init ctnetlink_init(void) { int ret; - NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); + NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index feb54c63a116..29387b605f3e 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -997,7 +997,7 @@ static int genl_start(struct netlink_callback *cb) info->info.attrs = attrs; genl_info_net_set(&info->info, sock_net(cb->skb->sk)); info->info.extack = cb->extack; - memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr)); + memset(&info->info.ctx, 0, sizeof(info->info.ctx)); cb->data = info; if (ops->start) { @@ -1104,7 +1104,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); - memset(&info.user_ptr, 0, sizeof(info.user_ptr)); + memset(&info.ctx, 0, sizeof(info.ctx)); if (ops->pre_doit) { err = ops->pre_doit(ops, skb, &info); -- cgit v1.2.3 From 4b623f9f0f59652ea71fcb27d60b4c3b65126dbb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:49 +0200 Subject: net-shapers: implement NL get operation Introduce the basic infrastructure to implement the net-shaper core functionality. Each network devices carries a net-shaper cache, the NL get() operation fetches the data from such cache. The cache is initially empty, will be fill by the set()/group() operation implemented later and is destroyed at device cleanup time. The net_shaper_fill_handle(), net_shaper_ctx_init(), and net_shaper_generic_pre() implementations handle generic index type attributes, despite the current caller always pass a constant value to avoid more noise in later patches using them with different attributes. Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ddd10fd645a9367803ad02fca4a5664ea5ace170.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/kapi.rst | 3 + include/linux/netdevice.h | 21 +++ include/net/net_shaper.h | 120 ++++++++++++++ net/core/dev.c | 6 + net/core/dev.h | 6 + net/shaper/shaper.c | 335 +++++++++++++++++++++++++++++++++++++- 6 files changed, 484 insertions(+), 7 deletions(-) create mode 100644 include/net/net_shaper.h (limited to 'include/net') diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst index ea55f462cefa..98682b9a13ee 100644 --- a/Documentation/networking/kapi.rst +++ b/Documentation/networking/kapi.rst @@ -104,6 +104,9 @@ Driver Support .. kernel-doc:: include/linux/netdevice.h :internal: +.. kernel-doc:: include/net/net_shaper.h + :internal: + PHY Support ----------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3baf8e539b6f..e6b93d01e631 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1603,6 +1603,14 @@ struct net_device_ops { int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_ops: Device shaping offload operations + * see include/net/net_shapers.h + */ + const struct net_shaper_ops *net_shaper_ops; +#endif }; /** @@ -2406,6 +2414,19 @@ struct net_device { u64 max_pacing_offload_horizon; + /** + * @lock: protects @net_shaper_hierarchy, feel free to use for other + * netdev-scope protection. Ordering: take after rtnl_lock. + */ + struct mutex lock; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_hierarchy: data tracking the current shaper status + * see include/net/net_shapers.h + */ + struct net_shaper_hierarchy *net_shaper_hierarchy; +#endif u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h new file mode 100644 index 000000000000..5c3f49b52fe9 --- /dev/null +++ b/include/net/net_shaper.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_SHAPER_H_ +#define _NET_SHAPER_H_ + +#include + +#include + +struct net_device; +struct devlink; +struct netlink_ext_ack; + +enum net_shaper_binding_type { + NET_SHAPER_BINDING_TYPE_NETDEV, + /* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT */ +}; + +struct net_shaper_binding { + enum net_shaper_binding_type type; + union { + struct net_device *netdev; + struct devlink *devlink; + }; +}; + +struct net_shaper_handle { + enum net_shaper_scope scope; + u32 id; +}; + +/** + * struct net_shaper - represents a shaping node on the NIC H/W + * zeroed field are considered not set. + * @parent: Unique identifier for the shaper parent, usually implied + * @handle: Unique identifier for this shaper + * @metric: Specify if the rate limits refers to PPS or BPS + * @bw_min: Minimum guaranteed rate for this shaper + * @bw_max: Maximum peak rate allowed for this shaper + * @burst: Maximum burst for the peek rate of this shaper + * @priority: Scheduling priority for this shaper + * @weight: Scheduling weight for this shaper + */ +struct net_shaper { + struct net_shaper_handle parent; + struct net_shaper_handle handle; + enum net_shaper_metric metric; + u64 bw_min; + u64 bw_max; + u64 burst; + u32 priority; + u32 weight; + + /* private: */ + u32 leaves; /* accounted only for NODE scope */ + struct rcu_head rcu; +}; + +/** + * struct net_shaper_ops - Operations on device H/W shapers + * + * The operations applies to either net_device and devlink objects. + * The initial shaping configuration at device initialization is empty: + * does not constraint the rate in any way. + * The network core keeps track of the applied user-configuration in + * the net_device or devlink structure. + * The operations are serialized via a per device lock. + * + * Device not supporting any kind of nesting should not provide the + * group operation. + * + * Each shaper is uniquely identified within the device with a 'handle' + * comprising the shaper scope and a scope-specific id. + */ +struct net_shaper_ops { + /** + * @group: create the specified shapers scheduling group + * + * Nest the @leaves shapers identified under the * @node shaper. + * All the shapers belong to the device specified by @binding. + * The @leaves arrays size is specified by @leaves_count. + * Create either the @leaves and the @node shaper; or if they already + * exists, links them together in the desired way. + * @leaves scope must be NET_SHAPER_SCOPE_QUEUE. + */ + int (*group)(struct net_shaper_binding *binding, int leaves_count, + const struct net_shaper *leaves, + const struct net_shaper *node, + struct netlink_ext_ack *extack); + + /** + * @set: Updates the specified shaper + * + * Updates or creates the @shaper on the device specified by @binding. + */ + int (*set)(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack); + + /** + * @delete: Removes the specified shaper + * + * Removes the shaper configuration as identified by the given @handle + * on the device specified by @binding, restoring the default behavior. + */ + int (*delete)(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack); + + /** + * @capabilities: get the shaper features supported by the device + * + * Fills the bitmask @cap with the supported capabilities for the + * specified @scope and device specified by @binding. + */ + void (*capabilities)(struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long *cap); +}; + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index ea5fbcd133ae..6e727c49a6f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11147,6 +11147,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, hash_init(dev->qdisc_hash); #endif + mutex_init(&dev->lock); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11217,6 +11219,8 @@ void free_netdev(struct net_device *dev) return; } + mutex_destroy(&dev->lock); + kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11426,6 +11430,8 @@ void unregister_netdevice_many_notify(struct list_head *head, mutex_destroy(&dev->ethtool->rss_lock); + net_shaper_flush_netdev(dev); + if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); diff --git a/net/core/dev.h b/net/core/dev.h index 5654325c5b71..13c558874af3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -35,6 +35,12 @@ void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +#endif + /* sysctls not referred to from outside net/core/ */ extern int netdev_unregister_timeout_secs; extern int weight_p; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index a1b20888f502..22daf7dde999 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -1,30 +1,333 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include #include +#include +#include #include +#include +#include +#include #include "shaper_nl_gen.h" +#include "../core/dev.h" + +#define NET_SHAPER_SCOPE_SHIFT 26 +#define NET_SHAPER_ID_MASK GENMASK(NET_SHAPER_SCOPE_SHIFT - 1, 0) +#define NET_SHAPER_SCOPE_MASK GENMASK(31, NET_SHAPER_SCOPE_SHIFT) + +#define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK + +struct net_shaper_hierarchy { + struct xarray shapers; +}; + +struct net_shaper_nl_ctx { + struct net_shaper_binding binding; + netdevice_tracker dev_tracker; + unsigned long start_index; +}; + +static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) +{ + return &((struct net_shaper_nl_ctx *)ctx)->binding; +} + +static struct net_shaper_hierarchy * +net_shaper_hierarchy(struct net_shaper_binding *binding) +{ + /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ + if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) + return READ_ONCE(binding->netdev->net_shaper_hierarchy); + + /* No other type supported yet. */ + return NULL; +} + +static int net_shaper_fill_binding(struct sk_buff *msg, + const struct net_shaper_binding *binding, + u32 type) +{ + /* Should never happen, as currently only NETDEV is supported. */ + if (WARN_ON_ONCE(binding->type != NET_SHAPER_BINDING_TYPE_NETDEV)) + return -EINVAL; + + if (nla_put_u32(msg, type, binding->netdev->ifindex)) + return -EMSGSIZE; + + return 0; +} + +static int net_shaper_fill_handle(struct sk_buff *msg, + const struct net_shaper_handle *handle, + u32 type) +{ + struct nlattr *handle_attr; + + if (handle->scope == NET_SHAPER_SCOPE_UNSPEC) + return 0; + + handle_attr = nla_nest_start(msg, type); + if (!handle_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, NET_SHAPER_A_HANDLE_SCOPE, handle->scope) || + (handle->scope >= NET_SHAPER_SCOPE_QUEUE && + nla_put_u32(msg, NET_SHAPER_A_HANDLE_ID, handle->id))) + goto handle_nest_cancel; + + nla_nest_end(msg, handle_attr); + return 0; + +handle_nest_cancel: + nla_nest_cancel(msg, handle_attr); + return -EMSGSIZE; +} + +static int +net_shaper_fill_one(struct sk_buff *msg, + const struct net_shaper_binding *binding, + const struct net_shaper *shaper, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(msg, info); + if (!hdr) + return -EMSGSIZE; + + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || + net_shaper_fill_handle(msg, &shaper->parent, + NET_SHAPER_A_PARENT) || + net_shaper_fill_handle(msg, &shaper->handle, + NET_SHAPER_A_HANDLE) || + ((shaper->bw_min || shaper->bw_max || shaper->burst) && + nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || + (shaper->bw_min && + nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || + (shaper->bw_max && + nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || + (shaper->burst && + nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || + (shaper->priority && + nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || + (shaper->weight && + nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/* Initialize the context fetching the relevant device and + * acquiring a reference to it. + */ +static int net_shaper_ctx_setup(const struct genl_info *info, int type, + struct net_shaper_nl_ctx *ctx) +{ + struct net *ns = genl_info_net(info); + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, type)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[type]); + dev = netdev_get_by_index(ns, ifindex, &ctx->dev_tracker, GFP_KERNEL); + if (!dev) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + return -ENOENT; + } + + if (!dev->netdev_ops->net_shaper_ops) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + netdev_put(dev, &ctx->dev_tracker); + return -EOPNOTSUPP; + } + + ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; + ctx->binding.netdev = dev; + return 0; +} + +static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx) +{ + if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) + netdev_put(ctx->binding.netdev, &ctx->dev_tracker); +} + +static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) +{ + return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) | + FIELD_PREP(NET_SHAPER_ID_MASK, handle->id); +} + +static struct net_shaper * +net_shaper_lookup(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + u32 index = net_shaper_handle_to_index(handle); + + return hierarchy ? xa_load(&hierarchy->shapers, index) : NULL; +} + +static int net_shaper_parse_handle(const struct nlattr *attr, + const struct genl_info *info, + struct net_shaper_handle *handle) +{ + struct nlattr *tb[NET_SHAPER_A_HANDLE_MAX + 1]; + struct nlattr *id_attr; + u32 id = 0; + int ret; + + ret = nla_parse_nested(tb, NET_SHAPER_A_HANDLE_MAX, attr, + net_shaper_handle_nl_policy, info->extack); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, + NET_SHAPER_A_HANDLE_SCOPE)) + return -EINVAL; + + handle->scope = nla_get_u32(tb[NET_SHAPER_A_HANDLE_SCOPE]); + + /* The default id for NODE scope shapers is an invalid one + * to help the 'group' operation discriminate between new + * NODE shaper creation (ID_UNSPEC) and reuse of existing + * shaper (any other value). + */ + id_attr = tb[NET_SHAPER_A_HANDLE_ID]; + if (id_attr) + id = nla_get_u32(id_attr); + else if (handle->scope == NET_SHAPER_SCOPE_NODE) + id = NET_SHAPER_ID_UNSPEC; + + handle->id = id; + return 0; +} + +static int net_shaper_generic_pre(struct genl_info *info, int type) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx)); + + return net_shaper_ctx_setup(info, type, ctx); +} + int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + return net_shaper_generic_pre(info, NET_SHAPER_A_IFINDEX); +} + +static void net_shaper_generic_post(struct genl_info *info) +{ + net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)info->ctx); } void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + net_shaper_generic_post(info); +} + +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + + return net_shaper_ctx_setup(info, NET_SHAPER_A_IFINDEX, ctx); +} + +int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +{ + net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)cb->ctx); + return 0; } int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct net_shaper_binding *binding; + struct net_shaper_handle handle; + struct net_shaper *shaper; + struct sk_buff *msg; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) + return -EINVAL; + + binding = net_shaper_binding_from_ctx(info->ctx); + ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, + &handle); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rcu_read_lock(); + shaper = net_shaper_lookup(binding, &handle); + if (!shaper) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NET_SHAPER_A_HANDLE]); + rcu_read_unlock(); + ret = -ENOENT; + goto free_msg; + } + + ret = net_shaper_fill_one(msg, binding, shaper, info); + rcu_read_unlock(); + if (ret) + goto free_msg; + + ret = genlmsg_reply(msg, info); + if (ret) + goto free_msg; + + return 0; + +free_msg: + nlmsg_free(msg); + return ret; } int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding *binding; + struct net_shaper *shaper; + int ret = 0; + + /* Don't error out dumps performed before any set operation. */ + binding = net_shaper_binding_from_ctx(ctx); + hierarchy = net_shaper_hierarchy(binding); + if (!hierarchy) + return 0; + + rcu_read_lock(); + for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, + U32_MAX, XA_PRESENT)); ctx->start_index++) { + ret = net_shaper_fill_one(skb, binding, shaper, info); + if (ret) + break; + } + rcu_read_unlock(); + + return ret; } int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) @@ -37,14 +340,32 @@ int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } -int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +static void net_shaper_flush(struct net_shaper_binding *binding) { - return -EOPNOTSUPP; + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *cur; + unsigned long index; + + if (!hierarchy) + return; + + xa_lock(&hierarchy->shapers); + xa_for_each(&hierarchy->shapers, index, cur) { + __xa_erase(&hierarchy->shapers, index); + kfree(cur); + } + xa_unlock(&hierarchy->shapers); + kfree(hierarchy); } -int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +void net_shaper_flush_netdev(struct net_device *dev) { - return -EOPNOTSUPP; + struct net_shaper_binding binding = { + .type = NET_SHAPER_BINDING_TYPE_NETDEV, + .netdev = dev, + }; + + net_shaper_flush(&binding); } static int __init shaper_init(void) -- cgit v1.2.3 From d677aebd663ddc287f2b2bda098474694a0ca875 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 03:41:00 +0000 Subject: tcp: move sysctl_tcp_l3mdev_accept to netns_ipv4_read_rx sysctl_tcp_l3mdev_accept is read from TCP receive fast path from tcp_v6_early_demux(), __inet6_lookup_established, inet_request_bound_dev_if(). Move it to netns_ipv4_read_rx. Remove the '#ifdef CONFIG_NET_L3_MASTER_DEV' that was guarding its definition. Note this adds a hole of three bytes that could be filled later. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Cc: Wei Wang Cc: Coco Li Link: https://patch.msgid.link/20241010034100.320832-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst | 2 +- include/net/netns/ipv4.h | 5 ++--- net/core/net_namespace.c | 4 +++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index 392e08a6ec04..629da6dc6d74 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -59,7 +59,7 @@ u8 sysctl_udp_early_demux u8 sysctl_nexthop_compat_mode u8 sysctl_fwmark_reflect u8 sysctl_tcp_fwmark_accept -u8 sysctl_tcp_l3mdev_accept +u8 sysctl_tcp_l3mdev_accept read_mostly __inet6_lookup_established/inet_request_bound_dev_if u8 sysctl_tcp_mtu_probing int sysctl_tcp_mtu_probe_floor int sysctl_tcp_base_mss diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 66a4cffc44ee..3b1de80b5c25 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,6 +76,8 @@ struct netns_ipv4 { __cacheline_group_begin(netns_ipv4_read_rx); u8 sysctl_ip_early_demux; u8 sysctl_tcp_early_demux; + u8 sysctl_tcp_l3mdev_accept; + /* 3 bytes hole, try to pack */ int sysctl_tcp_reordering; int sysctl_tcp_rmem[3]; __cacheline_group_end(netns_ipv4_read_rx); @@ -151,9 +153,6 @@ struct netns_ipv4 { u8 sysctl_fwmark_reflect; u8 sysctl_tcp_fwmark_accept; -#ifdef CONFIG_NET_L3_MASTER_DEV - u8 sysctl_tcp_l3mdev_accept; -#endif u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a5bc1fd8b034..0a86aff17f51 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1159,11 +1159,13 @@ static void __init netns_ipv4_struct_check(void) sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_l3mdev_accept); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif -- cgit v1.2.3 From 05a344e54d0b4892736526e4a309851da8ee9c89 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:20 -0700 Subject: netlabel,smack: use lsm_prop for audit data Replace the secid in the netlbl_audit structure with an lsm_prop. Remove scaffolding that was required when the value was a secid. Signed-off-by: Casey Schaufler [PM: fix the subject line] Signed-off-by: Paul Moore --- include/net/netlabel.h | 2 +- net/netlabel/netlabel_unlabeled.c | 5 +---- net/netlabel/netlabel_user.c | 7 +++---- net/netlabel/netlabel_user.h | 6 +----- security/smack/smackfs.c | 4 +--- 5 files changed, 7 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 529160f76cac..8de8344ee93c 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -97,7 +97,7 @@ struct calipso_doi; /* NetLabel audit information */ struct netlbl_audit { - u32 secid; + struct lsm_prop prop; kuid_t loginuid; unsigned int sessionid; }; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 5925f48a3ade..1bc2d0890a9f 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1534,14 +1534,11 @@ int __init netlbl_unlabel_defconf(void) int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; - struct lsm_prop prop; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ - security_current_getlsmprop_subj(&prop); - /* scaffolding */ - audit_info.secid = prop.scaffold.secid; + security_current_getlsmprop_subj(&audit_info.prop); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 3ed4fea2a2de..81635a13987b 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -98,10 +98,9 @@ struct audit_buffer *netlbl_audit_start_common(int type, from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - if (audit_info->secid != 0 && - security_secid_to_secctx(audit_info->secid, - &secctx, - &secctx_len) == 0) { + if (lsmprop_is_set(&audit_info->prop) && + security_lsmprop_to_secctx(&audit_info->prop, &secctx, + &secctx_len) == 0) { audit_log_format(audit_buf, " subj=%s", secctx); security_release_secctx(secctx, secctx_len); } diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index 39f4f6df5f51..d4c434956212 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -32,11 +32,7 @@ */ static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { - struct lsm_prop prop; - - security_current_getlsmprop_subj(&prop); - /* scaffolding */ - audit_info->secid = prop.scaffold.secid; + security_current_getlsmprop_subj(&audit_info->prop); audit_info->loginuid = audit_get_loginuid(current); audit_info->sessionid = audit_get_sessionid(current); } diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 5dd1e164f9b1..1401412fd794 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -182,11 +182,9 @@ static inline void smack_catset_bit(unsigned int cat, char *catsetp) */ static void smk_netlabel_audit_set(struct netlbl_audit *nap) { - struct smack_known *skp = smk_of_current(); - nap->loginuid = audit_get_loginuid(current); nap->sessionid = audit_get_sessionid(current); - nap->secid = skp->smk_secid; + nap->prop.smack.skp = smk_of_current(); } /* -- cgit v1.2.3 From a716ff52bebfe806fbcf5227cee4c7bda4e6724b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:01 +0000 Subject: fib: rules: use READ_ONCE()/WRITE_ONCE() on ops->fib_rules_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() on readers. Constify 'struct net' argument of fib_rules_seq_read() and lookup_rules_ops(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 2 +- net/core/fib_rules.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index d17855c52ef9..04383d90a1e3 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -176,7 +176,7 @@ int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); -unsigned int fib_rules_seq_read(struct net *net, int family); +unsigned int fib_rules_seq_read(const struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 154a2681f55c..82ef090c0037 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -101,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +static struct fib_rules_ops *lookup_rules_ops(const struct net *net, + int family) { struct fib_rules_ops *ops; @@ -370,7 +371,9 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ops->fib_rules_seq++; + ASSERT_RTNL(); + /* Paired with READ_ONCE() in fib_rules_seq() */ + WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } @@ -397,17 +400,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, } EXPORT_SYMBOL_GPL(fib_rules_dump); -unsigned int fib_rules_seq_read(struct net *net, int family) +unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; - ASSERT_RTNL(); - ops = lookup_rules_ops(net, family); if (!ops) return 0; - fib_rules_seq = ops->fib_rules_seq; + /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ + fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; -- cgit v1.2.3 From 16207384d29287a19f81436e1953b41946aa8258 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:02 +0000 Subject: ipv4: use READ_ONCE()/WRITE_ONCE() on net->ipv4.fib_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() when reading it. Constify 'struct net' argument of fib4_rules_seq_read() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 4 ++-- include/net/netns/ipv4.h | 2 +- net/ipv4/fib_notifier.c | 8 ++++---- net/ipv4/fib_rules.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 06130933542d..b6e44f4eaa4c 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -347,7 +347,7 @@ static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static inline unsigned int fib4_rules_seq_read(struct net *net) +static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } @@ -411,7 +411,7 @@ static inline bool fib4_has_custom_rules(const struct net *net) bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib4_rules_seq_read(struct net *net); +unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3b1de80b5c25..3c014170e001 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -262,7 +262,7 @@ struct netns_ipv4 { #endif struct fib_notifier_ops *notifier_ops; - unsigned int fib_seq; /* protected by rtnl_mutex */ + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct fib_notifier_ops *ipmr_notifier_ops; unsigned int ipmr_seq; /* protected by rtnl_mutex */ diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 0e23ade74493..21c85c80de64 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -22,15 +22,15 @@ int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, ASSERT_RTNL(); info->family = AF_INET; - net->ipv4.fib_seq++; + /* Paired with READ_ONCE() in fib4_seq_read() */ + WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1); return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(struct net *net) { - ASSERT_RTNL(); - - return net->ipv4.fib_seq + fib4_rules_seq_read(net); + /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ + return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b07292d50ee7..8325224ef072 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -74,7 +74,7 @@ int fib4_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, AF_INET, extack); } -unsigned int fib4_rules_seq_read(struct net *net) +unsigned int fib4_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET); } -- cgit v1.2.3 From e60ea45447768c48309b944596a8a34f6bae50e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:03 +0000 Subject: ipv6: use READ_ONCE()/WRITE_ONCE() on fib6_table->fib_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() when reading it. Constify 'struct net' argument of fib6_tables_seq_read() and fib6_rules_seq_read(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 8 ++++---- net/ipv6/fib6_rules.c | 2 +- net/ipv6/ip6_fib.c | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6cb867ce4878..7c87873ae211 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -394,7 +394,7 @@ struct fib6_table { struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; - unsigned int fib_seq; + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -563,7 +563,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); -unsigned int fib6_tables_seq_read(struct net *net); +unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); @@ -632,7 +632,7 @@ void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib6_rules_seq_read(struct net *net); +unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, @@ -676,7 +676,7 @@ static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, { return 0; } -static inline unsigned int fib6_rules_seq_read(struct net *net) +static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 04a9ed5e8310..c85c1627cb16 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -56,7 +56,7 @@ int fib6_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, AF_INET6, extack); } -unsigned int fib6_rules_seq_read(struct net *net) +unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index eb111d20615c..cea160b249d2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -345,17 +345,17 @@ static void __net_init fib6_tables_init(struct net *net) #endif -unsigned int fib6_tables_seq_read(struct net *net) +unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - struct hlist_head *head = &net->ipv6.fib_table_hash[h]; - struct fib6_table *tb; + const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib_seq += tb->fib_seq; + fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); @@ -400,7 +400,7 @@ int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -416,7 +416,7 @@ int call_fib6_multipath_entry_notifiers(struct net *net, .nsiblings = nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -427,7 +427,7 @@ int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) .nsiblings = rt->fib6_nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } -- cgit v1.2.3 From 2698acd6ea4770809af7e65bb8b3250e0a3a807e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:05 +0000 Subject: net: do not acquire rtnl in fib_seq_sum() After we made sure no fib_seq_read() handlers needs RTNL anymore, we can remove RTNL from fib_seq_sum(). Note that after RTNL was dropped, fib_seq_sum() result was possibly outdated anyway. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_notifier.h | 2 +- net/core/fib_notifier.c | 2 -- net/ipv4/fib_notifier.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv6/fib6_notifier.c | 2 +- net/ipv6/ip6mr.c | 2 +- 6 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 6d59221ff05a..48aad6128fea 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -28,7 +28,7 @@ enum fib_event_type { struct fib_notifier_ops { int family; struct list_head list; - unsigned int (*fib_seq_read)(struct net *net); + unsigned int (*fib_seq_read)(const struct net *net); int (*fib_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct module *owner; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index fc96259807b6..5cdca49b1d7c 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net) struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - rtnl_lock(); rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) @@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net) module_put(ops->owner); } rcu_read_unlock(); - rtnl_unlock(); return fib_seq; } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 21c85c80de64..b1551c26554b 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -27,7 +27,7 @@ int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, return call_fib_notifiers(net, event_type, info); } -static unsigned int fib4_seq_read(struct net *net) +static unsigned int fib4_seq_read(const struct net *net) { /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 35ed03165184..7a95daeb1946 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3035,7 +3035,7 @@ static const struct net_protocol pim_protocol = { }; #endif -static unsigned int ipmr_seq_read(struct net *net) +static unsigned int ipmr_seq_read(const struct net *net) { return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index f87ae33e1d01..949b72610df7 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -22,7 +22,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, return call_fib_notifiers(net, event_type, info); } -static unsigned int fib6_seq_read(struct net *net) +static unsigned int fib6_seq_read(const struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 3f9501fd8c1a..9528e17665fd 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1260,7 +1260,7 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; } -static unsigned int ip6mr_seq_read(struct net *net) +static unsigned int ip6mr_seq_read(const struct net *net) { return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } -- cgit v1.2.3 From 7f20dbd7de7b9b2804e6bf54b0c22f2bc447cd64 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:20 +0800 Subject: net: tunnel: add pskb_inet_may_pull_reason() helper Introduce the function pskb_inet_may_pull_reason() and make pskb_inet_may_pull a simple inline call to it. The drop reasons of it just come from pskb_may_pull_reason(). Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6194fbb564c6..7fc2f7bf837a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -439,7 +439,8 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); -static inline bool pskb_inet_may_pull(struct sk_buff *skb) +static inline enum skb_drop_reason +pskb_inet_may_pull_reason(struct sk_buff *skb) { int nhlen; @@ -456,7 +457,12 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) nhlen = 0; } - return pskb_network_may_pull(skb, nhlen); + return pskb_network_may_pull_reason(skb, nhlen); +} + +static inline bool pskb_inet_may_pull(struct sk_buff *skb) +{ + return pskb_inet_may_pull_reason(skb) == SKB_NOT_DROPPED_YET; } /* Variant of pskb_inet_may_pull(). -- cgit v1.2.3 From 9990ddf47d4168088e2246c3d418bf526e40830d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:21 +0800 Subject: net: tunnel: make skb_vlan_inet_prepare() return drop reasons Make skb_vlan_inet_prepare return the skb drop reasons, which is just what pskb_may_pull_reason() returns. Meanwhile, adjust all the call of it. Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/bareudp.c | 4 ++-- drivers/net/geneve.c | 4 ++-- drivers/net/vxlan/vxlan_core.c | 2 +- include/net/ip_tunnels.h | 13 ++++++++----- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index e057526448d7..fa2dd76ba3d9 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -317,7 +317,7 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be32 saddr; int err; - if (!skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) + if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) @@ -387,7 +387,7 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) + if (skb_vlan_inet_prepare(skb, skb->protocol != htons(ETH_P_TEB))) return -EINVAL; if (!sock) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 7f611c74eb62..2f29b1386b1c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -827,7 +827,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!skb_vlan_inet_prepare(skb, inner_proto_inherit)) + if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs4) @@ -937,7 +937,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, __be16 sport; int err; - if (!skb_vlan_inet_prepare(skb, inner_proto_inherit)) + if (skb_vlan_inet_prepare(skb, inner_proto_inherit)) return -EINVAL; if (!gs6) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 6e9a3795846a..916c3880832e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2356,7 +2356,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 vni = 0; no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB); - if (!skb_vlan_inet_prepare(skb, no_eth_encap)) + if (skb_vlan_inet_prepare(skb, no_eth_encap)) goto drop; old_iph = ip_hdr(skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 7fc2f7bf837a..4e4f9e24c9c1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -467,11 +467,12 @@ static inline bool pskb_inet_may_pull(struct sk_buff *skb) /* Variant of pskb_inet_may_pull(). */ -static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, - bool inner_proto_inherit) +static inline enum skb_drop_reason +skb_vlan_inet_prepare(struct sk_buff *skb, bool inner_proto_inherit) { int nhlen = 0, maclen = inner_proto_inherit ? 0 : ETH_HLEN; __be16 type = skb->protocol; + enum skb_drop_reason reason; /* Essentially this is skb_protocol(skb, true) * And we get MAC len. @@ -492,11 +493,13 @@ static inline bool skb_vlan_inet_prepare(struct sk_buff *skb, /* For ETH_P_IPV6/ETH_P_IP we make sure to pull * a base network header in skb->head. */ - if (!pskb_may_pull(skb, maclen + nhlen)) - return false; + reason = pskb_may_pull_reason(skb, maclen + nhlen); + if (reason) + return reason; skb_set_network_header(skb, maclen); - return true; + + return SKB_NOT_DROPPED_YET; } static inline int ip_encap_hlen(struct ip_tunnel_encap *e) -- cgit v1.2.3 From 4c06d9daf8e6215447ca8a2ddd59fa09862c9bae Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:22 +0800 Subject: net: vxlan: add skb drop reasons to vxlan_rcv() Introduce skb drop reasons to the function vxlan_rcv(). Following new drop reasons are added: SKB_DROP_REASON_VXLAN_INVALID_HDR SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND SKB_DROP_REASON_IP_TUNNEL_ECN Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 26 ++++++++++++++++++++------ include/net/dropreason-core.h | 16 ++++++++++++++++ 2 files changed, 36 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 916c3880832e..40111d05d18e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1671,13 +1671,15 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); + enum skb_drop_reason reason; bool raw_proto = false; void *oiph; __be32 vni = 0; int nh; /* Need UDP and VXLAN header to be present */ - if (!pskb_may_pull(skb, VXLAN_HLEN)) + reason = pskb_may_pull_reason(skb, VXLAN_HLEN); + if (reason) goto drop; unparsed = *vxlan_hdr(skb); @@ -1686,6 +1688,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", ntohl(vxlan_hdr(skb)->vx_flags), ntohl(vxlan_hdr(skb)->vx_vni)); + reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; /* Return non vxlan pkt */ goto drop; } @@ -1699,8 +1702,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); - if (!vxlan) + if (!vxlan) { + reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND; goto drop; + } /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. @@ -1713,8 +1718,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, - !net_eq(vxlan->net, dev_net(vxlan->dev)))) + !net_eq(vxlan->net, dev_net(vxlan->dev)))) { + reason = SKB_DROP_REASON_NOMEM; goto drop; + } if (vs->flags & VXLAN_F_REMCSUM_RX) if (unlikely(!vxlan_remcsum(&unparsed, skb, vs->flags))) @@ -1728,8 +1735,10 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags, key32_to_tunnel_id(vni), sizeof(*md)); - if (!tun_dst) + if (!tun_dst) { + reason = SKB_DROP_REASON_NOMEM; goto drop; + } md = ip_tunnel_info_opts(&tun_dst->u.tun_info); @@ -1753,6 +1762,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) * is more robust and provides a little more security in * adding extensions to VXLAN. */ + reason = SKB_DROP_REASON_VXLAN_INVALID_HDR; goto drop; } @@ -1773,7 +1783,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); - if (!pskb_inet_may_pull(skb)) { + reason = pskb_inet_may_pull_reason(skb); + if (reason) { DEV_STATS_INC(vxlan->dev, rx_length_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, @@ -1785,6 +1796,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) oiph = skb->head + nh; if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { + reason = SKB_DROP_REASON_IP_TUNNEL_ECN; DEV_STATS_INC(vxlan->dev, rx_frame_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, @@ -1799,6 +1811,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) dev_core_stats_rx_dropped_inc(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); + reason = SKB_DROP_REASON_DEV_READY; goto drop; } @@ -1811,8 +1824,9 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) return 0; drop: + reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED; /* Consume bad packet */ - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; } diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 4748680e8c88..98259d2b3e92 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -92,6 +92,9 @@ FN(PACKET_SOCK_ERROR) \ FN(TC_CHAIN_NOTFOUND) \ FN(TC_RECLASSIFY_LOOP) \ + FN(VXLAN_INVALID_HDR) \ + FN(VXLAN_VNI_NOT_FOUND) \ + FN(IP_TUNNEL_ECN) \ FNe(MAX) /** @@ -418,6 +421,19 @@ enum skb_drop_reason { * iterations. */ SKB_DROP_REASON_TC_RECLASSIFY_LOOP, + /** + * @SKB_DROP_REASON_VXLAN_INVALID_HDR: VXLAN header is invalid. E.g.: + * 1) reserved fields are not zero + * 2) "I" flag is not set + */ + SKB_DROP_REASON_VXLAN_INVALID_HDR, + /** @SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND: no VXLAN device found for VNI */ + SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND, + /** + * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to + * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. + */ + SKB_DROP_REASON_IP_TUNNEL_ECN, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit v1.2.3 From 289fd4e75219a96f77c5d679166035cd5118d139 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:24 +0800 Subject: net: vxlan: make vxlan_snoop() return drop reasons Change the return type of vxlan_snoop() from bool to enum skb_drop_reason. In this commit, two drop reasons are introduced: SKB_DROP_REASON_MAC_INVALID_SOURCE SKB_DROP_REASON_VXLAN_ENTRY_EXISTS Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 17 +++++++++-------- include/net/dropreason-core.h | 9 +++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index e3ac46d6e148..947c29d6605c 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1437,9 +1437,10 @@ errout: * and Tunnel endpoint. * Return true if packet is bogus and should be dropped. */ -static bool vxlan_snoop(struct net_device *dev, - union vxlan_addr *src_ip, const u8 *src_mac, - u32 src_ifindex, __be32 vni) +static enum skb_drop_reason vxlan_snoop(struct net_device *dev, + union vxlan_addr *src_ip, + const u8 *src_mac, u32 src_ifindex, + __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; @@ -1447,7 +1448,7 @@ static bool vxlan_snoop(struct net_device *dev, /* Ignore packets from invalid src-address */ if (!is_valid_ether_addr(src_mac)) - return true; + return SKB_DROP_REASON_MAC_INVALID_SOURCE; #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && @@ -1461,15 +1462,15 @@ static bool vxlan_snoop(struct net_device *dev, if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) - return false; + return SKB_NOT_DROPPED_YET; /* Don't migrate static entries, drop packets */ if (f->state & (NUD_PERMANENT | NUD_NOARP)) - return true; + return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; /* Don't override an fdb with nexthop with a learnt entry */ if (rcu_access_pointer(f->nh)) - return true; + return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; if (net_ratelimit()) netdev_info(dev, @@ -1497,7 +1498,7 @@ static bool vxlan_snoop(struct net_device *dev, spin_unlock(&vxlan->hash_lock[hash_index]); } - return false; + return SKB_NOT_DROPPED_YET; } static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 98259d2b3e92..1cb8d7c953be 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -94,6 +94,8 @@ FN(TC_RECLASSIFY_LOOP) \ FN(VXLAN_INVALID_HDR) \ FN(VXLAN_VNI_NOT_FOUND) \ + FN(MAC_INVALID_SOURCE) \ + FN(VXLAN_ENTRY_EXISTS) \ FN(IP_TUNNEL_ECN) \ FNe(MAX) @@ -429,6 +431,13 @@ enum skb_drop_reason { SKB_DROP_REASON_VXLAN_INVALID_HDR, /** @SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND: no VXLAN device found for VNI */ SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND, + /** @SKB_DROP_REASON_MAC_INVALID_SOURCE: source mac is invalid */ + SKB_DROP_REASON_MAC_INVALID_SOURCE, + /** + * @SKB_DROP_REASON_VXLAN_ENTRY_EXISTS: trying to migrate a static + * entry or an entry pointing to a nexthop. + */ + SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, /** * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. -- cgit v1.2.3 From d209706f562ee4fa81bdf24cf6b679c3222aa06c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:25 +0800 Subject: net: vxlan: make vxlan_set_mac() return drop reasons Change the return type of vxlan_set_mac() from bool to enum skb_drop_reason. In this commit, the drop reason "SKB_DROP_REASON_LOCAL_MAC" is introduced for the case that the source mac of the packet is a local mac. Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 19 ++++++++++--------- include/net/dropreason-core.h | 6 ++++++ 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 947c29d6605c..b4f9f23797de 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1609,9 +1609,9 @@ out: unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } -static bool vxlan_set_mac(struct vxlan_dev *vxlan, - struct vxlan_sock *vs, - struct sk_buff *skb, __be32 vni) +static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, + struct vxlan_sock *vs, + struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; u32 ifindex = skb->dev->ifindex; @@ -1622,7 +1622,7 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan, /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) - return false; + return SKB_DROP_REASON_LOCAL_MAC; /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { @@ -1635,11 +1635,11 @@ static bool vxlan_set_mac(struct vxlan_dev *vxlan, #endif } - if ((vxlan->cfg.flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni)) - return false; + if (!(vxlan->cfg.flags & VXLAN_F_LEARN)) + return SKB_NOT_DROPPED_YET; - return true; + return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, + ifindex, vni); } static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, @@ -1774,7 +1774,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) } if (!raw_proto) { - if (!vxlan_set_mac(vxlan, vs, skb, vni)) + reason = vxlan_set_mac(vxlan, vs, skb, vni); + if (reason) goto drop; } else { skb_reset_mac_header(skb); diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 1cb8d7c953be..fbf92d442c1b 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -97,6 +97,7 @@ FN(MAC_INVALID_SOURCE) \ FN(VXLAN_ENTRY_EXISTS) \ FN(IP_TUNNEL_ECN) \ + FN(LOCAL_MAC) \ FNe(MAX) /** @@ -443,6 +444,11 @@ enum skb_drop_reason { * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. */ SKB_DROP_REASON_IP_TUNNEL_ECN, + /** + * @SKB_DROP_REASON_LOCAL_MAC: the source MAC address is equal to + * the MAC address of the local netdev. + */ + SKB_DROP_REASON_LOCAL_MAC, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit v1.2.3 From b71a576e452b800efeac49ecca116d954601d911 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Oct 2024 10:28:26 +0800 Subject: net: vxlan: use kfree_skb_reason() in vxlan_xmit() Replace kfree_skb() with kfree_skb_reason() in vxlan_xmit(). Following new skb drop reasons are introduced for vxlan: /* no remote found for xmit */ SKB_DROP_REASON_VXLAN_NO_REMOTE /* packet without necessary metadata reached a device which is * in "external" mode */ SKB_DROP_REASON_TUNNEL_TXINFO Signed-off-by: Menglong Dong Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/vxlan/vxlan_core.c | 6 +++--- include/net/dropreason-core.h | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b4f9f23797de..649b4ea5c362 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2730,7 +2730,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) if (info && info->mode & IP_TUNNEL_INFO_TX) vxlan_xmit_one(skb, dev, vni, NULL, false); else - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO); return NETDEV_TX_OK; } } @@ -2793,7 +2793,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); return NETDEV_TX_OK; } } @@ -2816,7 +2816,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_NO_REMOTE); } return NETDEV_TX_OK; diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index fbf92d442c1b..d59bb96c5a02 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -96,7 +96,9 @@ FN(VXLAN_VNI_NOT_FOUND) \ FN(MAC_INVALID_SOURCE) \ FN(VXLAN_ENTRY_EXISTS) \ + FN(VXLAN_NO_REMOTE) \ FN(IP_TUNNEL_ECN) \ + FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ FNe(MAX) @@ -439,11 +441,18 @@ enum skb_drop_reason { * entry or an entry pointing to a nexthop. */ SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, + /** @SKB_DROP_REASON_VXLAN_NO_REMOTE: no remote found for xmit */ + SKB_DROP_REASON_VXLAN_NO_REMOTE, /** * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. */ SKB_DROP_REASON_IP_TUNNEL_ECN, + /** + * @SKB_DROP_REASON_TUNNEL_TXINFO: packet without necessary metadata + * reached a device which is in "external" mode. + */ + SKB_DROP_REASON_TUNNEL_TXINFO, /** * @SKB_DROP_REASON_LOCAL_MAC: the source MAC address is equal to * the MAC address of the local netdev. -- cgit v1.2.3 From b692bf9a7543af7ad11a59d182a3757578f0ba53 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:53 +0200 Subject: xsk: Get rid of xdp_buff_xsk::xskb_list_node Let's bring xdp_buff_xsk back to occupying 2 cachelines by removing xskb_list_node - for the purpose of gathering the xskb frags free_list_node can be used, head of the list (xsk_buff_pool::xskb_list) stays as-is, just reuse the node ptr. It is safe to do as a single xdp_buff_xsk can never reside in two pool's lists simultaneously. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-2-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 1 - net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 1 - 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0a5dca2b2b3f..360bc1244c6a 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { - list_del(&pos->xskb_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_del(&pos->free_list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, xskb_list_node); + struct xdp_buff_xsk, free_list_node); if (frag) { - list_del(&frag->xskb_list_node); + list_del(&frag->free_list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->xskb_list_node); + list_del(&xskb->free_list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - xskb_list_node); + free_list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bacb33f1e3e5..aa7f1d0b3a5e 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -30,7 +30,6 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; - struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1140b2a120ca..9c93064349a8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->xskb_list_node); + list_del(&pos->free_list_node); } return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 521a2938e50a..e5368db7d18e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -102,7 +102,6 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); - INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else -- cgit v1.2.3 From 30ec2c1baaead43903ad63ff8e3083949059083c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:54 +0200 Subject: xsk: s/free_list_node/list_node/ Now that free_list_node's purpose is two-folded, make it just a 'list_node'. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-3-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 14 +++++++------- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 360bc1244c6a..40085afd9160 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { - list_del(&pos->free_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, free_list_node); + struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->free_list_node); + list_del(&frag->list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->free_list_node); + list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - free_list_node); + list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index aa7f1d0b3a5e..af8b6f776f86 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; u64 orig_addr; - struct list_head free_list_node; + struct list_head list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c93064349a8..520023405908 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->free_list_node); + list_del(&pos->list_node); } return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e5368db7d18e..973557d5e4f7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -101,7 +101,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -549,8 +549,8 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } else { pool->free_list_cnt--; xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, - free_list_node); - list_del_init(&xskb->free_list_node); + list_node); + list_del_init(&xskb->list_node); } xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; @@ -616,8 +616,8 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3 i = nb_entries; while (i--) { - xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); - list_del_init(&xskb->free_list_node); + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, list_node); + list_del_init(&xskb->list_node); *xdp = &xskb->xdp; xdp++; @@ -687,11 +687,11 @@ EXPORT_SYMBOL(xp_can_alloc); void xp_free(struct xdp_buff_xsk *xskb) { - if (!list_empty(&xskb->free_list_node)) + if (!list_empty(&xskb->list_node)) return; xskb->pool->free_list_cnt++; - list_add(&xskb->free_list_node, &xskb->pool->free_list); + list_add(&xskb->list_node, &xskb->pool->free_list); } EXPORT_SYMBOL(xp_free); -- cgit v1.2.3 From bea14124bacbe5c9366381e62635eed28ac892ae Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:55 +0200 Subject: xsk: Get rid of xdp_buff_xsk::orig_addr Continue the process of dieting xdp_buff_xsk by removing orig_addr member. It can be calculated from xdp->data_hard_start where it was previously used, so it is not anything that has to be carried around in struct used widely in hot path. This has been used for initializing xdp_buff_xsk::frame_dma during pool setup and as a shortcut in xp_get_handle() to retrieve address provided to xsk Rx queue. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-4-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 19 +++++++++++-------- net/xdp/xsk.c | 2 +- net/xdp/xsk_buff_pool.c | 4 +++- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index af8b6f776f86..468a23b1b4c5 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -28,7 +28,6 @@ struct xdp_buff_xsk { dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - u64 orig_addr; struct list_head list_node; }; @@ -119,7 +118,6 @@ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { - xskb->orig_addr = addr; xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } @@ -221,14 +219,19 @@ static inline void xp_release(struct xdp_buff_xsk *xskb) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } -static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) { - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + if (!pool->unaligned) + return orig_addr; + + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + orig_addr -= offset; + offset += pool->headroom; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 520023405908..6c31c1de1619 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -141,7 +141,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u64 addr; int err; - addr = xp_get_handle(xskb); + addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 973557d5e4f7..7ecd4ccd2473 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -416,8 +416,10 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ for (i = 0; i < pool->heads_cnt; i++) { struct xdp_buff_xsk *xskb = &pool->heads[i]; + u64 orig_addr; - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + orig_addr = xskb->xdp.data_hard_start - pool->addrs - pool->headroom; + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, orig_addr); } } -- cgit v1.2.3 From 6e126872191df946a6fe01b79273119d32d96711 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:56 +0200 Subject: xsk: Carry a copy of xdp_zc_max_segs within xsk_buff_pool This so we avoid dereferencing struct net_device within hot path. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-5-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 1 + net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 468a23b1b4c5..bb03cee716b3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -76,6 +76,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 7ecd4ccd2473..e946ba4a5ccf 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -229,6 +229,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_xsk; } pool->umem->zc = true; + pool->xdp_zc_max_segs = netdev->xdp_zc_max_segs; return 0; err_unreg_xsk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 406b20dfee8d..46d87e961ad6 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -260,7 +260,7 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, nr_frags = 0; } else { nr_frags++; - if (nr_frags == pool->netdev->xdp_zc_max_segs) { + if (nr_frags == pool->xdp_zc_max_segs) { nr_frags = 0; break; } -- cgit v1.2.3 From 78e2baf3d96edd21c6f26d8afc0e68d02ec2c51c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:13 +0000 Subject: net: add TIME_WAIT logic to sk_to_full_sk() TCP will soon attach TIME_WAIT sockets to some ACK and RST. Make sure sk_to_full_sk() detects this and does not return a non full socket. v3: also changed sk_const_to_full_sk() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin KaFai Lau Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 2 +- include/net/inet_sock.h | 8 ++++++-- net/core/filter.c | 6 +----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ce91d9b2acb9..f0f219271daf 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \ + if (__sk && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index f01dd273bea6..56d8bc5593d3 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk) static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } diff --git a/net/core/filter.c b/net/core/filter.c index bd0d08bf76bb..202c1d386e19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -7276,7 +7272,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; -- cgit v1.2.3 From bc43a3c83cad46a27d6e3bf869acdd926bbe79ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:14 +0000 Subject: net_sched: sch_fq: prepare for TIME_WAIT sockets TCP stack is not attaching skb to TIME_WAIT sockets yet, but we would like to allow this in the future. Add sk_listener_or_tw() helper to detect the three states that FQ needs to take care. Like NEW_SYN_RECV, TIME_WAIT are not full sockets and do not contain sk->sk_pacing_status, sk->sk_pacing_rate. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 10 ++++++++++ net/sched/sch_fq.c | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 6da420ab1ee1..2f200d91ef11 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2802,6 +2802,16 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT + * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) + * TCP RST and ACK can be attached to TIME_WAIT. + */ +static inline bool sk_listener_or_tw(const struct sock *sk) +{ + return (1 << READ_ONCE(sk->sk_state)) & + (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); +} + void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index aeabf45c9200..a97638bef6da 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -362,8 +362,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb, * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE * 4) We pretend they are orphaned + * TCP can also associate TIME_WAIT sockets with RST or ACK packets. */ - if (!sk || sk_listener(sk)) { + if (!sk || sk_listener_or_tw(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not -- cgit v1.2.3 From 5ced52fa8f0dc23adb067f7b8a009a5ee051efb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:15 +0000 Subject: net: add skb_set_owner_edemux() helper This can be used to attach a socket to an skb, taking a reference on sk->sk_refcnt. This helper might be a NOP if sk->sk_refcnt is zero. Use it from tcp_make_synack(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 +++++++++ net/core/sock.c | 9 +++------ net/ipv4/tcp_output.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 2f200d91ef11..bf7fa3db10ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1760,6 +1760,15 @@ void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); + +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + skb->sk = sk; + skb->destructor = sock_edemux; + } +} #else #define sock_edemux sock_efree #endif diff --git a/net/core/sock.c b/net/core/sock.c index 083d438d8b6f..f8c0d4eda888 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2592,14 +2592,11 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); - skb->sk = sk; #ifdef CONFIG_INET - if (unlikely(!sk_fullsock(sk))) { - skb->destructor = sock_edemux; - sock_hold(sk); - return; - } + if (unlikely(!sk_fullsock(sk))) + return skb_set_owner_edemux(skb, sk); #endif + skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 06200bb111f8..054244ce5117 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3728,7 +3728,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, switch (synack_type) { case TCP_SYNACK_NORMAL: - skb_set_owner_w(skb, req_to_sk(req)); + skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, -- cgit v1.2.3 From 79636038d37e7bd4d078238f2a3f002cab4423bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:17 +0000 Subject: ipv4: tcp: give socket pointer to control skbs ip_send_unicast_reply() send orphaned 'control packets'. These are RST packets and also ACK packets sent from TIME_WAIT. Some eBPF programs would prefer to have a meaningful skb->sk pointer as much as possible. This means that TCP can now attach TIME_WAIT sockets to outgoing skbs. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 3 ++- net/ipv4/ip_output.c | 5 ++++- net/ipv4/tcp_ipv4.c | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index bab084df1567..4be0a6a603b2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e5c55a95063d..0065b1996c94 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1596,7 +1596,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. */ -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1662,6 +1663,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; + if (orig_sk) + skb_set_owner_edemux(nskb, (struct sock *)orig_sk); if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 985028434f64..9d3dd101ea71 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -907,7 +907,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; } - ip_send_unicast_reply(ctl_sk, + ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, @@ -1021,7 +1021,7 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); - ip_send_unicast_reply(ctl_sk, + ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, -- cgit v1.2.3 From 95b3120a485f77a9bb8060bf3398311e3dcb6c65 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 16:52:16 -0700 Subject: neighbour: Remove NEIGH_DN_TABLE. Since commit 1202cdd66531 ("Remove DECnet support from kernel"), NEIGH_DN_TABLE is no longer used. MPLS has implicit dependency on it in nla_put_via(), but nla_get_via() does not support DECnet. Let's remove NEIGH_DN_TABLE. Now, neigh_tables[] has only 2 elements and no extra iteration for DECnet in many places. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241014235216.10785-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 - net/mpls/af_mpls.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index a44f262a7384..3887ed9e5026 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -239,7 +239,6 @@ struct neigh_table { enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, - NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index df62638b6498..a0573847bc55 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1664,7 +1664,7 @@ static int nla_put_via(struct sk_buff *skb, u8 table, const void *addr, int alen) { static const int table_to_family[NEIGH_NR_TABLES + 1] = { - AF_INET, AF_INET6, AF_DECnet, AF_PACKET, + AF_INET, AF_INET6, AF_PACKET, }; struct nlattr *nla; struct rtvia *via; -- cgit v1.2.3 From e1c6c383123ab1caadbfe39b3362ce0cc09dd766 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:28 -0700 Subject: rtnetlink: Remove rtnl_register() and rtnl_register_module(). No one uses rtnl_register() and rtnl_register_module(). Let's remove them. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 15 ++++++---- net/core/rtnetlink.c | 74 ++++++++++++++----------------------------------- 2 files changed, 31 insertions(+), 58 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 2d3eb7cb4dff..bb49c5708ce7 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -29,6 +29,16 @@ static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) return msgtype & RTNL_KIND_MASK; } +/** + * struct rtnl_msg_handler - rtnetlink message type and handlers + * + * @owner: NULL for built-in, THIS_MODULE for module + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + */ struct rtnl_msg_handler { struct module *owner; int protocol; @@ -38,11 +48,6 @@ struct rtnl_msg_handler { int flags; }; -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_register_module(struct module *owner, int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0fbbfeb2cb50..a9c92392fb1d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -338,57 +338,6 @@ unlock: return ret; } -/** - * rtnl_register_module - Register a rtnetlink message type - * - * @owner: module registering the hook (THIS_MODULE) - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Like rtnl_register, but for use by removable modules. - */ -int rtnl_register_module(struct module *owner, - int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - return rtnl_register_internal(owner, protocol, msgtype, - doit, dumpit, flags); -} -EXPORT_SYMBOL_GPL(rtnl_register_module); - -/** - * rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - int err; - - err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, - flags); - if (err) - pr_err("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", protocol, msgtype); -} - /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC @@ -396,7 +345,7 @@ void rtnl_register(int protocol, int msgtype, * * Returns 0 on success or a negative error code. */ -int rtnl_unregister(int protocol, int msgtype) +static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; @@ -419,7 +368,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } -EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol @@ -454,6 +402,26 @@ void rtnl_unregister_all(int protocol) } EXPORT_SYMBOL_GPL(rtnl_unregister_all); +/** + * __rtnl_register_many - Register rtnetlink message types + * @handlers: Array of struct rtnl_msg_handlers + * @n: The length of @handlers + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * When one element of @handlers fails to register, + * 1) built-in: panics. + * 2) modules : the previous successful registrations are unwinded + * and an error is returned. + * + * Use rtnl_register_many(). + */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; -- cgit v1.2.3 From 43c7ce69d28e185f62fe2b8be2c681c5cac0bc6b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:50 -0700 Subject: rtnetlink: Protect struct rtnl_link_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_link_ops is alive during inflight RTM_NEWLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_link_ops_get() now iterates link_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_link_ops_put() to release the pointer after the use. Also, __rtnl_link_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_NEWLINK requests to complete. Note that link_ops needs to be protected by its dedicated lock when RTNL is removed. Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 ++- net/core/rtnetlink.c | 83 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 65 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bb49c5708ce7..1a6aa5ca74f3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -3,6 +3,7 @@ #define __NET_RTNETLINK_H #include +#include #include typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, @@ -69,7 +70,8 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number @@ -100,6 +102,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) */ struct rtnl_link_ops { struct list_head list; + struct srcu_struct srcu; const char *kind; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9c9290a6c271..31b105b3a834 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -457,15 +457,29 @@ EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static LIST_HEAD(link_ops); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { - const struct rtnl_link_ops *ops; + struct rtnl_link_ops *ops; - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** @@ -480,8 +494,16 @@ static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) */ int __rtnl_link_register(struct rtnl_link_ops *ops) { - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + struct rtnl_link_ops *tmp; + int err; + + /* When RTNL is removed, add lock for link_ops. */ + ASSERT_RTNL(); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) + return -EEXIST; + } /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible @@ -491,7 +513,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; - list_add_tail(&ops->list, &link_ops); + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + list_add_tail_rcu(&ops->list, &link_ops); + return 0; } EXPORT_SYMBOL_GPL(__rtnl_link_register); @@ -542,10 +569,12 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - for_each_net(net) { + list_del_rcu(&ops->list); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + for_each_net(net) __rtnl_kill_links(net, ops); - } - list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); @@ -2158,10 +2187,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; -static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) { - const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; @@ -2170,7 +2200,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; @@ -2290,8 +2320,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { - const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -2302,6 +2332,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; + int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; @@ -2335,7 +2366,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: - kind_ops = linkinfo_to_kind_ops(tb[i]); + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { @@ -2361,6 +2392,10 @@ walk_entries: if (err < 0) break; } + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -3747,8 +3782,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **tb, **linkinfo, **data = NULL; - const struct rtnl_link_ops *ops = NULL; + struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + int ops_srcu_index; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); @@ -3780,13 +3816,13 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); rtnl_lock(); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif } @@ -3800,7 +3836,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (ret < 0) - goto free; + goto put_ops; data = tbs->attr; } @@ -3808,12 +3844,15 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ops->validate) { ret = ops->validate(tb, data, extack); if (ret < 0) - goto free; + goto put_ops; } } ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); free: kfree(tbs); return ret; -- cgit v1.2.3 From 26eebdc4b005ccd4cf63f4fef4c9c0adf9bfa380 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:56 -0700 Subject: rtnetlink: Return int from rtnl_af_register(). The next patch will add init_srcu_struct() in rtnl_af_register(), then we need to handle its error. Let's add the error handling in advance to make the following patch cleaner. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matt Johnston Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 2 +- net/bridge/br_netlink.c | 6 +++++- net/core/rtnetlink.c | 4 +++- net/ipv4/devinet.c | 3 ++- net/ipv6/addrconf.c | 5 ++++- net/mctp/device.c | 16 +++++++++++----- net/mpls/af_mpls.c | 5 ++++- 7 files changed, 30 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 1a6aa5ca74f3..969138ae2f4b 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -204,7 +204,7 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void rtnl_af_register(struct rtnl_af_ops *ops); +int rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6b97ae47f855..3e0f47203f2a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1924,7 +1924,9 @@ int __init br_netlink_init(void) if (err) goto out; - rtnl_af_register(&br_af_ops); + err = rtnl_af_register(&br_af_ops); + if (err) + goto out_vlan; err = rtnl_link_register(&br_link_ops); if (err) @@ -1934,6 +1936,8 @@ int __init br_netlink_init(void) out_af: rtnl_af_unregister(&br_af_ops); +out_vlan: + br_vlan_rtnl_uninit(); out: return err; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 445e6ffed75e..70b663aca209 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -686,11 +686,13 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) * * Returns 0 on success or a negative error code. */ -void rtnl_af_register(struct rtnl_af_ops *ops) +int rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); + + return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ec29ead83e74..0ff9c0abfaa0 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2827,7 +2827,8 @@ void __init devinet_init(void) register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); - rtnl_af_register(&inet_af_ops); + if (rtnl_af_register(&inet_af_ops)) + panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ac8645ad2537..d0a99710d65d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7468,7 +7468,9 @@ int __init addrconf_init(void) addrconf_verify(&init_net); - rtnl_af_register(&inet6_ops); + err = rtnl_af_register(&inet6_ops); + if (err) + goto erraf; err = rtnl_register_many(addrconf_rtnl_msg_handlers); if (err) @@ -7482,6 +7484,7 @@ int __init addrconf_init(void) errout: rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); +erraf: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: destroy_workqueue(addrconf_wq); diff --git a/net/mctp/device.c b/net/mctp/device.c index 85cc5f31f1e7..3d75b919995d 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -535,14 +535,20 @@ int __init mctp_device_init(void) int err; register_netdevice_notifier(&mctp_dev_nb); - rtnl_af_register(&mctp_af_ops); + + err = rtnl_af_register(&mctp_af_ops); + if (err) + goto err_notifier; err = rtnl_register_many(mctp_device_rtnl_msg_handlers); - if (err) { - rtnl_af_unregister(&mctp_af_ops); - unregister_netdevice_notifier(&mctp_dev_nb); - } + if (err) + goto err_af; + return 0; +err_af: + rtnl_af_unregister(&mctp_af_ops); +err_notifier: + unregister_netdevice_notifier(&mctp_dev_nb); return err; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a0573847bc55..1f63b32d76d6 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2753,7 +2753,9 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); - rtnl_af_register(&mpls_af_ops); + err = rtnl_af_register(&mpls_af_ops); + if (err) + goto out_unregister_dev_type; err = rtnl_register_many(mpls_rtnl_msg_handlers); if (err) @@ -2773,6 +2775,7 @@ out_unregister_rtnl: rtnl_unregister_many(mpls_rtnl_msg_handlers); out_unregister_rtnl_af: rtnl_af_unregister(&mpls_af_ops); +out_unregister_dev_type: dev_remove_pack(&mpls_packet_type); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); -- cgit v1.2.3 From 6ab0f866948323724e95cf14d9e47fd77703c192 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:57 -0700 Subject: rtnetlink: Protect struct rtnl_af_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_af_ops is alive during inflight RTM_SETLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_af_lookup() now iterates rtnl_af_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_af_put() to release the pointer after the use. Also, rtnl_af_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_SETLINK requests to complete. Note that rtnl_af_ops needs to be protected by its dedicated lock when RTNL is removed. Note also that BUG_ON() in do_setlink() is changed to the normal error handling as a different af_ops might be found after validate_linkmsg(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 +++- net/core/rtnetlink.c | 63 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 969138ae2f4b..e0d9a8eae6b6 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -172,7 +172,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. @@ -185,6 +186,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); */ struct rtnl_af_ops { struct list_head list; + struct srcu_struct srcu; + int family; int (*fill_link_af)(struct sk_buff *skb, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 70b663aca209..194a81e5f608 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -666,18 +666,31 @@ static size_t rtnl_link_get_size(const struct net_device *dev) static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; ASSERT_RTNL(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** @@ -688,6 +701,11 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) */ int rtnl_af_register(struct rtnl_af_ops *ops) { + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; + rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); @@ -707,6 +725,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) rtnl_unlock(); synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -2579,20 +2599,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - af_ops = rtnl_af_lookup(nla_type(af)); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; - - if (af_ops->validate_link_af) { + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); - if (err < 0) - return err; - } + else + err = 0; + + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } @@ -3172,11 +3196,18 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto errout; + } err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); + if (err < 0) goto errout; -- cgit v1.2.3 From 83c289e81e88d01e55d6d56531502ed7b4886a05 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Oct 2024 19:19:34 +0300 Subject: net/sched: act_api: unexport tcf_action_dump_1() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn't used outside act_api.c, but is called by tcf_dump_walker() prior to its definition. So move it upwards and make it static. Simultaneously, reorder the variable declarations so that they follow the networking "reverse Christmas tree" coding style. Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241017161934.3599046-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- include/net/act_api.h | 1 - net/sched/act_api.c | 89 +++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 46 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 77ee0c657e2c..404df8557f6a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -219,7 +219,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); -int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5bbfb83ed600..c3f8dd7f8e65 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -504,6 +504,50 @@ nla_put_failure: return -1; } +static int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + int err = -EINVAL; + u32 flags; + + if (tcf_action_dump_terse(skb, a, false)) + goto nla_put_failure; + + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && + nla_put_bitfield32(skb, TCA_ACT_HW_STATS, + a->hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + if (a->used_hw_stats_valid && + nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, + a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; + if (flags && + nla_put_bitfield32(skb, TCA_ACT_FLAGS, + flags, flags)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) + goto nla_put_failure; + + nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { + nla_nest_end(skb, nest); + return err; + } + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -1190,51 +1234,6 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) -{ - int err = -EINVAL; - unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; - u32 flags; - - if (tcf_action_dump_terse(skb, a, false)) - goto nla_put_failure; - - if (a->hw_stats != TCA_ACT_HW_STATS_ANY && - nla_put_bitfield32(skb, TCA_ACT_HW_STATS, - a->hw_stats, TCA_ACT_HW_STATS_ANY)) - goto nla_put_failure; - - if (a->used_hw_stats_valid && - nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, - a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) - goto nla_put_failure; - - flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; - if (flags && - nla_put_bitfield32(skb, TCA_ACT_FLAGS, - flags, flags)) - goto nla_put_failure; - - if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) - goto nla_put_failure; - - nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - err = tcf_action_dump_old(skb, a, bind, ref); - if (err > 0) { - nla_nest_end(skb, nest); - return err; - } - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} -EXPORT_SYMBOL(tcf_action_dump_1); - int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse) { -- cgit v1.2.3 From c972c1c41d9b20fb38b54e77dcee763e27e715a9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 18:41:00 -0700 Subject: ipv4: Switch inet_addr_hash() to less predictable hash. Recently, commit 4a0ec2aa0704 ("ipv6: switch inet6_addr_hash() to less predictable hash") and commit 4daf4dc275f1 ("ipv6: switch inet6_acaddr_hash() to less predictable hash") hardened IPv6 address hash functions. inet_addr_hash() is also highly predictable, and a malicious use could abuse a specific bucket. Let's follow the change on IPv4 by using jhash_1word(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241018014100.93776-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 5 +++++ net/ipv4/devinet.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 4be0a6a603b2..0e548c1f2a0e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -690,6 +690,11 @@ static inline unsigned int ipv4_addr_hash(__be32 ip) return (__force unsigned int) ip; } +static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) +{ + return jhash_1word((__force u32)ip, initval); +} + static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0ff9c0abfaa0..5f859d01cbbe 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -121,7 +121,7 @@ struct inet_fill_args { static u32 inet_addr_hash(const struct net *net, __be32 addr) { - u32 val = (__force u32) addr ^ net_hash_mix(net); + u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } -- cgit v1.2.3 From 074a8b54dacc1920f54381f3661ecee6786b0c21 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 7 Oct 2024 15:00:45 +0300 Subject: wifi: mac80211: Add support to indicate that a new interface is to be added Add support to indicate to the driver that an interface is about to be added so that the driver could prepare its resources early if it needs so. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e0e8563e1c30.Ifccc96a46a347eb15752caefc9f4eff31f75ed47@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 ++++++++ net/mac80211/cfg.c | 18 ++++++++++++++++++ net/mac80211/driver-ops.h | 12 ++++++++++++ net/mac80211/trace.h | 19 +++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 333e0fae6796..0b8df8ec5a3b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4444,6 +4444,12 @@ struct ieee80211_prep_tx_info { * if the requested TID-To-Link mapping can be accepted or not. * If it's not accepted the driver may suggest a preferred mapping and * modify @ttlm parameter with the suggested TID-to-Link mapping. + * @prep_add_interface: prepare for interface addition. This can be used by + * drivers to prepare for the addition of a new interface, e.g., allocate + * the needed resources etc. This callback doesn't guarantee that an + * interface with the specified type would be added, and thus drivers that + * implement this callback need to handle such cases. The type is the full + * &enum nl80211_iftype. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4828,6 +4834,8 @@ struct ieee80211_ops { enum ieee80211_neg_ttlm_res (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *ttlm); + void (*prep_add_interface)(struct ieee80211_hw *hw, + enum nl80211_iftype type); }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 847304a3a29a..ce9558cd1576 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -194,6 +194,24 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, } } + /* Let the driver know that an interface is going to be added. + * Indicate so only for interface types that will be added to the + * driver. + */ + switch (type) { + case NL80211_IFTYPE_AP_VLAN: + break; + case NL80211_IFTYPE_MONITOR: + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || + !(params->flags & MONITOR_FLAG_ACTIVE)) + break; + fallthrough; + default: + drv_prep_add_interface(local, + ieee80211_vif_type_p2p(&sdata->vif)); + break; + } + return wdev; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index d382d9729e85..48bc2da728c0 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1728,4 +1728,16 @@ drv_can_neg_ttlm(struct ieee80211_local *local, return res; } + +static inline void +drv_prep_add_interface(struct ieee80211_local *local, + enum nl80211_iftype type) +{ + trace_drv_prep_add_interface(local, type); + if (local->ops->prep_add_interface) + local->ops->prep_add_interface(&local->hw, type); + + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index dc498cd8cd91..e6f0ce8e5d43 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -3154,6 +3154,25 @@ TRACE_EVENT(drv_neg_ttlm_res, LOCAL_PR_ARG, VIF_PR_ARG, __entry->res ) ); + +TRACE_EVENT(drv_prep_add_interface, + TP_PROTO(struct ieee80211_local *local, + enum nl80211_iftype type), + + TP_ARGS(local, type), + TP_STRUCT__entry(LOCAL_ENTRY + __field(u32, type) + ), + + TP_fast_assign(LOCAL_ASSIGN; + __entry->type = type; + ), + + TP_printk(LOCAL_PR_FMT " type: %u\n ", + LOCAL_PR_ARG, __entry->type + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 62262dd00c319195f2e14022903b7ebbb53119bc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:46 +0300 Subject: wifi: cfg80211: disallow SMPS in AP mode In practice, userspace hasn't been able to set this for many years, and mac80211 has already rejected it (which is now no longer needed), so reject SMPS mode (other than "OFF" to be a bit more compatible) in AP mode. Also remove the parameter from the AP settings struct. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.fe1fc46484cf.I8676fb52b818a4bedeb9c25b901e1396277ffc0b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/quantenna/qtnfmac/commands.c | 2 +- include/net/cfg80211.h | 2 -- net/mac80211/cfg.c | 3 --- net/wireless/nl80211.c | 30 +++-------------------- 4 files changed, 4 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c index 9540ad6196d7..956c5763662f 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/commands.c +++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c @@ -257,7 +257,7 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif, cmd->beacon_interval = cpu_to_le16(s->beacon_interval); cmd->hidden_ssid = qlink_hidden_ssid_nl2q(s->hidden_ssid); cmd->inactivity_timeout = cpu_to_le16(s->inactivity_timeout); - cmd->smps_mode = s->smps_mode; + cmd->smps_mode = NL80211_SMPS_OFF; cmd->p2p_ctwindow = s->p2p_ctwindow; cmd->p2p_opp_ps = s->p2p_opp_ps; cmd->pbss = s->pbss; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 69ec1eb41a09..c8ce5c2e14f4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1460,7 +1460,6 @@ struct cfg80211_unsol_bcast_probe_resp { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) - * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -1498,7 +1497,6 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; - enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ce9558cd1576..548b9bbdac04 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1312,9 +1312,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) return -EALREADY; - if (params->smps_mode != NL80211_SMPS_OFF) - return -EOPNOTSUPP; - link->smps_mode = IEEE80211_SMPS_OFF; link->needed_rx_chains = sdata->local->rx_chains; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e3609176880..fb35c03af34c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6227,33 +6227,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } - if (info->attrs[NL80211_ATTR_SMPS_MODE]) { - params->smps_mode = - nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); - switch (params->smps_mode) { - case NL80211_SMPS_OFF: - break; - case NL80211_SMPS_STATIC: - if (!(rdev->wiphy.features & - NL80211_FEATURE_STATIC_SMPS)) { - err = -EINVAL; - goto out; - } - break; - case NL80211_SMPS_DYNAMIC: - if (!(rdev->wiphy.features & - NL80211_FEATURE_DYNAMIC_SMPS)) { - err = -EINVAL; - goto out; - } - break; - default: - err = -EINVAL; - goto out; - } - } else { - params->smps_mode = NL80211_SMPS_OFF; - } + if (info->attrs[NL80211_ATTR_SMPS_MODE] && + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]) != NL80211_SMPS_OFF) + return -EOPNOTSUPP; params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { -- cgit v1.2.3 From 9c5f2c7eeb585834f8dadb552b4fd811dd2dee6f Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 7 Oct 2024 15:00:47 +0300 Subject: wifi: mac80211: rename IEEE80211_CHANCTX_CHANGE_MIN_WIDTH The name is misleading, this actually indicates that ieee80211_chanctx_conf::min_def was updated. Rename it to IEEE80211_CHANCTX_CHANGE_MIN_DEF. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.726b5f12ae0c.I3bd9e594c9d2735183ec049a4c7224bd0a9599c9@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 4 ++-- include/net/mac80211.h | 4 ++-- net/mac80211/chan.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index a327893c6dce..11cd1a8fdd9e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -5067,7 +5067,7 @@ void iwl_mvm_change_chanctx(struct ieee80211_hw *hw, (changed & ~(IEEE80211_CHANCTX_CHANGE_WIDTH | IEEE80211_CHANCTX_CHANGE_RX_CHAINS | IEEE80211_CHANCTX_CHANGE_RADAR | - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH)), + IEEE80211_CHANCTX_CHANGE_MIN_DEF)), "Cannot change PHY. Ref=%d, changed=0x%X\n", phy_ctxt->ref, changed)) return; @@ -5075,7 +5075,7 @@ void iwl_mvm_change_chanctx(struct ieee80211_hw *hw, guard(mvm)(mvm); /* we are only changing the min_width, may be a noop */ - if (changed == IEEE80211_CHANCTX_CHANGE_MIN_WIDTH) { + if (changed == IEEE80211_CHANCTX_CHANGE_MIN_DEF) { if (phy_ctxt->width == def->width) return; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0b8df8ec5a3b..c42ad5a0c303 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -213,7 +213,7 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA - * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed + * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider * bandwidth) OFDMA settings need to be changed * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap @@ -224,7 +224,7 @@ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH = BIT(4), + IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4), IEEE80211_CHANCTX_CHANGE_AP = BIT(5), IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index a155e418d26b..3f7df45b0431 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -409,7 +409,7 @@ _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, if (!ctx->driver_present) return 0; - return IEEE80211_CHANCTX_CHANGE_MIN_WIDTH; + return IEEE80211_CHANCTX_CHANGE_MIN_DEF; } static void ieee80211_chan_bw_change(struct ieee80211_local *local, -- cgit v1.2.3 From e21dd758cf4c51d508e6665b653c5836103d1027 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:48 +0300 Subject: wifi: mac80211: make bss_param_ch_cnt available for the low level driver Drivers may need to track this. Make it available for them, and maintain the value when beacons are received. When link X receives a beacon, iterate the RNR elements and update all the links with their respective data. Track the link id that updated the data so that each link can know whether the update came from its own beacon or from another link. In case, the update came from the link's own beacon, always update the updater link id. The purpose is to let the low level driver know if a link is losing its beacons. If link X is losing its beacons, it can still track the bss_param_ch_cnt and know where the update came from. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e2d8d1a722ad.I04b883daba2cd48e5730659eb62ca1614c899cbb@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 ++++++++ net/mac80211/ieee80211_i.h | 2 - net/mac80211/mlme.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c42ad5a0c303..20562676e9cd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -740,6 +740,19 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This + * information is the latest known value. It can come from this link's + * beacon or from a beacon sent by another link. + * @bss_param_ch_cnt_link_id: in BSS-mode, the link_id to which the beacon + * that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear + * its beacons, and link 2 sent a beacon with an RNR element that updated + * link 1's BSS params change count, then, link 1's + * bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that + * link 2 was the link that updated its bss_param_ch_cnt value. + * In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will + * be updated to 1, even if bss_param_ch_cnt didn't change. This allows + * the link to know that it heard the latest value from its own beacon + * (as opposed to hearing its value from another link's beacon). */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; @@ -834,6 +847,8 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + u8 bss_param_ch_cnt; + u8 bss_param_ch_cnt_link_id; }; /** diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index afb867dc6b24..d884d05826d3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1015,8 +1015,6 @@ struct ieee80211_link_data_managed { int wmm_last_param_set; int mu_edca_last_param_set; - - u8 bss_param_ch_cnt; }; struct ieee80211_link_data_ap { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0303972c23e4..93a11a59339c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2643,6 +2643,89 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, &ifmgd->csa_connection_drop_work); } +struct sta_bss_param_ch_cnt_data { + struct ieee80211_sub_if_data *sdata; + u8 reporting_link_id; + u8 mld_id; +}; + +static enum cfg80211_rnr_iter_ret +ieee80211_sta_bss_param_ch_cnt_iter(void *_data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len) +{ + struct sta_bss_param_ch_cnt_data *data = _data; + struct ieee80211_sub_if_data *sdata = data->sdata; + const struct ieee80211_tbtt_info_ge_11 *ti; + u8 bss_param_ch_cnt; + int link_id; + + if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) + return RNR_ITER_CONTINUE; + + if (tbtt_info_len < sizeof(*ti)) + return RNR_ITER_CONTINUE; + + ti = (const void *)tbtt_info; + + if (ti->mld_params.mld_id != data->mld_id) + return RNR_ITER_CONTINUE; + + link_id = le16_get_bits(ti->mld_params.params, + IEEE80211_RNR_MLD_PARAMS_LINK_ID); + bss_param_ch_cnt = + le16_get_bits(ti->mld_params.params, + IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); + + if (bss_param_ch_cnt != 255 && + link_id < ARRAY_SIZE(sdata->link)) { + struct ieee80211_link_data *link = + sdata_dereference(sdata->link[link_id], sdata); + + if (link && link->conf->bss_param_ch_cnt != bss_param_ch_cnt) { + link->conf->bss_param_ch_cnt = bss_param_ch_cnt; + link->conf->bss_param_ch_cnt_link_id = + data->reporting_link_id; + } + } + + return RNR_ITER_CONTINUE; +} + +static void +ieee80211_mgd_update_bss_param_ch_cnt(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss_conf *bss_conf, + struct ieee802_11_elems *elems) +{ + struct sta_bss_param_ch_cnt_data data = { + .reporting_link_id = bss_conf->link_id, + .sdata = sdata, + }; + int bss_param_ch_cnt; + + if (!elems->ml_basic) + return; + + data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); + + cfg80211_iter_rnr(elems->ie_start, elems->total_len, + ieee80211_sta_bss_param_ch_cnt_iter, &data); + + bss_param_ch_cnt = + ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); + + /* + * Update bss_param_ch_cnt_link_id even if bss_param_ch_cnt + * didn't change to indicate that we got a beacon on our own + * link. + */ + if (bss_param_ch_cnt >= 0 && bss_param_ch_cnt != 255) { + bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt_link_id = + bss_conf->link_id; + } +} + static bool ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, @@ -4667,7 +4750,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, ret = false; goto out; } - link->u.mgd.bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt_link_id = link_id; } } else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC || !elems->prof || @@ -4677,6 +4761,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, } else { const u8 *ptr = elems->prof->variable + elems->prof->sta_info_len - 1; + int bss_param_ch_cnt; /* * During parsing, we validated that these fields exist, @@ -4684,8 +4769,10 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, */ capab_info = get_unaligned_le16(ptr); assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); - link->u.mgd.bss_param_ch_cnt = + bss_param_ch_cnt = ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(elems->prof); + bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt_link_id = link_id; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { link_info(link, "association response status code=%u\n", @@ -6913,6 +7000,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, /* note that after this elems->ml_basic can no longer be used fully */ ieee80211_mgd_check_cross_link_csa(sdata, rx_status->link_id, elems); + ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); + if (!link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, -- cgit v1.2.3 From 88b67e91e2928c3311f3658d1270b40708b0de00 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:54 +0300 Subject: wifi: mac80211: call rate_control_rate_update() for link STA In order to update the right link information, call the update rate_control_rate_update() with the right link_sta, and then pass that through to the driver's sta_rc_update() method. The software rate control still doesn't support it, but that'll be skipped by not having a rate control ref. Since it now operates on a link sta, rename the driver method. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.5851b6b5fd41.Ibdf50d96afa4b761dd9b9dfd54a1147e77a75329@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 5 +++-- drivers/net/wireless/ath/ath11k/mac.c | 5 +++-- drivers/net/wireless/ath/ath12k/mac.c | 5 +++-- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 6 ++++-- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 5 +++-- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/main.c | 5 +++-- drivers/net/wireless/mediatek/mt76/mt7996/main.c | 5 +++-- drivers/net/wireless/realtek/rtw88/mac80211.c | 6 ++++-- drivers/net/wireless/realtek/rtw89/mac80211.c | 6 ++++-- drivers/net/wireless/ti/wlcore/main.c | 5 +++-- drivers/net/wireless/virtual/mac80211_hwsim.c | 8 ++++---- include/net/mac80211.h | 12 ++++++------ net/mac80211/chan.c | 2 +- net/mac80211/driver-ops.c | 15 ++++++++------- net/mac80211/driver-ops.h | 6 +++--- net/mac80211/ibss.c | 3 ++- net/mac80211/mesh_plink.c | 3 ++- net/mac80211/rate.c | 8 ++++---- net/mac80211/rate.h | 3 +-- net/mac80211/rx.c | 4 ++-- net/mac80211/tdls.c | 3 ++- net/mac80211/trace.h | 15 +++++++++------ net/mac80211/vht.c | 3 +-- 25 files changed, 80 insertions(+), 62 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 646e1737d4c4..41ab83c3d3f7 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8507,9 +8507,10 @@ exit: static void ath10k_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath10k *ar = hw->priv; struct ath10k_sta *arsta = (struct ath10k_sta *)sta->drv_priv; struct ath10k_vif *arvif = (void *)vif->drv_priv; @@ -9450,7 +9451,7 @@ static const struct ieee80211_ops ath10k_ops = { .reconfig_complete = ath10k_reconfig_complete, .get_survey = ath10k_get_survey, .set_bitrate_mask = ath10k_mac_op_set_bitrate_mask, - .sta_rc_update = ath10k_sta_rc_update, + .link_sta_rc_update = ath10k_sta_rc_update, .offset_tsf = ath10k_offset_tsf, .ampdu_action = ath10k_ampdu_action, .get_et_sset_count = ath10k_debug_get_et_sset_count, diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index f8068d2e848c..e6acbff06749 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -5079,9 +5079,10 @@ static void ath11k_mac_op_sta_set_4addr(struct ieee80211_hw *hw, static void ath11k_mac_op_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath11k *ar = hw->priv; struct ath11k_sta *arsta = ath11k_sta_to_arsta(sta); struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif); @@ -9708,7 +9709,7 @@ static const struct ieee80211_ops ath11k_ops = { .sta_state = ath11k_mac_op_sta_state, .sta_set_4addr = ath11k_mac_op_sta_set_4addr, .sta_set_txpwr = ath11k_mac_op_sta_set_txpwr, - .sta_rc_update = ath11k_mac_op_sta_rc_update, + .link_sta_rc_update = ath11k_mac_op_sta_rc_update, .conf_tx = ath11k_mac_op_conf_tx, .set_antenna = ath11k_mac_op_set_antenna, .get_antenna = ath11k_mac_op_get_antenna, diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 137394c36460..acf5628adda5 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4737,9 +4737,10 @@ out: static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath12k *ar; struct ath12k_sta *arsta = ath12k_sta_to_arsta(sta); struct ath12k_vif *arvif = ath12k_vif_to_arvif(vif); @@ -8681,7 +8682,7 @@ static const struct ieee80211_ops ath12k_ops = { .set_rekey_data = ath12k_mac_op_set_rekey_data, .sta_state = ath12k_mac_op_sta_state, .sta_set_txpwr = ath12k_mac_op_sta_set_txpwr, - .sta_rc_update = ath12k_mac_op_sta_rc_update, + .link_sta_rc_update = ath12k_mac_op_sta_rc_update, .conf_tx = ath12k_mac_op_conf_tx, .set_antenna = ath12k_mac_op_set_antenna, .get_antenna = ath12k_mac_op_get_antenna, diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 8a03bcc2789e..57094bd45d98 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1357,8 +1357,10 @@ static int ath9k_htc_sta_remove(struct ieee80211_hw *hw, static void ath9k_htc_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, + u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath9k_htc_sta *ista = (struct ath9k_htc_sta *) sta->drv_priv; if (!(changed & IEEE80211_RC_SUPP_RATES_CHANGED)) @@ -1883,7 +1885,7 @@ struct ieee80211_ops ath9k_htc_ops = { .sta_add = ath9k_htc_sta_add, .sta_remove = ath9k_htc_sta_remove, .conf_tx = ath9k_htc_conf_tx, - .sta_rc_update = ath9k_htc_sta_rc_update, + .link_sta_rc_update = ath9k_htc_sta_rc_update, .bss_info_changed = ath9k_htc_bss_info_changed, .set_key = ath9k_htc_set_key, .get_tsf = ath9k_htc_get_tsf, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 11cd1a8fdd9e..3721d6349cc5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -4236,8 +4236,9 @@ int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value) } void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); if (changed & (IEEE80211_RC_BW_CHANGED | @@ -6562,7 +6563,7 @@ const struct ieee80211_ops iwl_mvm_hw_ops = { .allow_buffered_frames = iwl_mvm_mac_allow_buffered_frames, .release_buffered_frames = iwl_mvm_mac_release_buffered_frames, .set_rts_threshold = iwl_mvm_mac_set_rts_threshold, - .sta_rc_update = iwl_mvm_sta_rc_update, + .link_sta_rc_update = iwl_mvm_sta_rc_update, .conf_tx = iwl_mvm_mac_conf_tx, .mgd_prepare_tx = iwl_mvm_mac_mgd_prepare_tx, .mgd_complete_tx = iwl_mvm_mac_mgd_complete_tx, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c index f2378e0fb2fb..7de6c96646ca 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c @@ -1401,7 +1401,7 @@ const struct ieee80211_ops iwl_mvm_mld_hw_ops = { .allow_buffered_frames = iwl_mvm_mac_allow_buffered_frames, .release_buffered_frames = iwl_mvm_mac_release_buffered_frames, .set_rts_threshold = iwl_mvm_mac_set_rts_threshold, - .sta_rc_update = iwl_mvm_sta_rc_update, + .link_sta_rc_update = iwl_mvm_sta_rc_update, .conf_tx = iwl_mvm_mld_mac_conf_tx, .mgd_prepare_tx = iwl_mvm_mac_mgd_prepare_tx, .mgd_complete_tx = iwl_mvm_mac_mgd_complete_tx, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index ef07cff203b0..6246ffce7cf8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -2914,7 +2914,7 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw, bool more_data); int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value); void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed); + struct ieee80211_link_sta *link_sta, u32 changed); void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c index d75e8dea1fbd..c6f498fc81ff 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c @@ -1163,9 +1163,10 @@ static void mt7915_sta_rc_work(void *data, struct ieee80211_sta *sta) static void mt7915_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct mt7915_phy *phy = mt7915_hw_phy(hw); struct mt7915_dev *dev = phy->dev; struct mt7915_sta *msta = (struct mt7915_sta *)sta->drv_priv; @@ -1709,7 +1710,7 @@ const struct ieee80211_ops mt7915_ops = { .stop_ap = mt7915_stop_ap, .sta_state = mt76_sta_state, .sta_pre_rcu_remove = mt76_sta_pre_rcu_remove, - .sta_rc_update = mt7915_sta_rc_update, + .link_sta_rc_update = mt7915_sta_rc_update, .set_key = mt7915_set_key, .ampdu_action = mt7915_ampdu_action, .set_rts_threshold = mt7915_set_rts_threshold, diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c index 39f071ece35e..2b34ae5e0cb5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c @@ -1060,9 +1060,10 @@ static void mt7996_sta_rc_work(void *data, struct ieee80211_sta *sta) static void mt7996_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct mt7996_phy *phy = mt7996_hw_phy(hw); struct mt7996_dev *dev = phy->dev; @@ -1472,7 +1473,7 @@ const struct ieee80211_ops mt7996_ops = { .sta_add = mt7996_sta_add, .sta_remove = mt7996_sta_remove, .sta_pre_rcu_remove = mt76_sta_pre_rcu_remove, - .sta_rc_update = mt7996_sta_rc_update, + .link_sta_rc_update = mt7996_sta_rc_update, .set_key = mt7996_set_key, .ampdu_action = mt7996_ampdu_action, .set_rts_threshold = mt7996_set_rts_threshold, diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index b39e90fb66b4..026fbf4ad9cc 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -928,8 +928,10 @@ static int rtw_ops_set_sar_specs(struct ieee80211_hw *hw, static void rtw_ops_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, + u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct rtw_dev *rtwdev = hw->priv; struct rtw_sta_info *si = (struct rtw_sta_info *)sta->drv_priv; @@ -973,7 +975,7 @@ const struct ieee80211_ops rtw_ops = { .reconfig_complete = rtw_reconfig_complete, .hw_scan = rtw_ops_hw_scan, .cancel_hw_scan = rtw_ops_cancel_hw_scan, - .sta_rc_update = rtw_ops_sta_rc_update, + .link_sta_rc_update = rtw_ops_sta_rc_update, .set_sar_specs = rtw_ops_set_sar_specs, #ifdef CONFIG_PM .suspend = rtw_ops_suspend, diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index 1ee63a85308f..3f33c3a2ae7d 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -1290,8 +1290,10 @@ out: static void rtw89_ops_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, + u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct rtw89_dev *rtwdev = hw->priv; rtw89_phy_ra_update_sta(rtwdev, sta, changed); @@ -1593,7 +1595,7 @@ const struct ieee80211_ops rtw89_ops = { .remain_on_channel = rtw89_ops_remain_on_channel, .cancel_remain_on_channel = rtw89_ops_cancel_remain_on_channel, .set_sar_specs = rtw89_ops_set_sar_specs, - .sta_rc_update = rtw89_ops_sta_rc_update, + .link_sta_rc_update = rtw89_ops_sta_rc_update, .set_tid_config = rtw89_ops_set_tid_config, #ifdef CONFIG_PM .suspend = rtw89_ops_suspend, diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 0c77b8524160..986b07bfa0ee 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -5789,9 +5789,10 @@ static int wlcore_op_cancel_remain_on_channel(struct ieee80211_hw *hw, static void wlcore_op_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif); wl1271_debug(DEBUG_MAC80211, "mac80211 sta_rc_update"); @@ -6052,7 +6053,7 @@ static const struct ieee80211_ops wl1271_ops = { .assign_vif_chanctx = wlcore_op_assign_vif_chanctx, .unassign_vif_chanctx = wlcore_op_unassign_vif_chanctx, .switch_vif_chanctx = wlcore_op_switch_vif_chanctx, - .sta_rc_update = wlcore_op_sta_rc_update, + .link_sta_rc_update = wlcore_op_sta_rc_update, .sta_statistics = wlcore_op_sta_statistics, .get_expected_throughput = wlcore_op_get_expected_throughput, CFG80211_TESTMODE_CMD(wl1271_tm_cmd) diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index f0e528abb1b4..fe3e2dc1626b 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2594,10 +2594,11 @@ static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw, static void mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; + struct ieee80211_sta *sta = link_sta->sta; u32 bw = U32_MAX; int link_id; @@ -2607,7 +2608,6 @@ mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, link_id++) { enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_bss_conf *vif_conf; - struct ieee80211_link_sta *link_sta; link_sta = rcu_dereference(sta->link[link_id]); @@ -2659,7 +2659,7 @@ static int mac80211_hwsim_sta_add(struct ieee80211_hw *hw, hwsim_check_magic(vif); hwsim_set_sta_magic(sta); - mac80211_hwsim_sta_rc_update(hw, vif, sta, 0); + mac80211_hwsim_sta_rc_update(hw, vif, &sta->deflink, 0); if (sta->valid_links) { WARN(hweight16(sta->valid_links) > 1, @@ -3961,7 +3961,7 @@ out: .link_info_changed = mac80211_hwsim_link_info_changed, \ .tx_last_beacon = mac80211_hwsim_tx_last_beacon, \ .sta_notify = mac80211_hwsim_sta_notify, \ - .sta_rc_update = mac80211_hwsim_sta_rc_update, \ + .link_sta_rc_update = mac80211_hwsim_sta_rc_update, \ .conf_tx = mac80211_hwsim_conf_tx, \ .get_survey = mac80211_hwsim_get_survey, \ CFG80211_TESTMODE_CMD(mac80211_hwsim_testmode_cmd) \ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 20562676e9cd..7a6edc04e212 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4090,8 +4090,8 @@ struct ieee80211_prep_tx_info { * in @sta_state. * The callback can sleep. * - * @sta_rc_update: Notifies the driver of changes to the bitrates that can be - * used to transmit to the station. The changes are advertised with bits + * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can + * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since @@ -4581,10 +4581,10 @@ struct ieee80211_ops { void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); - void (*sta_rc_update)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - u32 changed); + void (*link_sta_rc_update)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index b355e9af268d..2d7ec557cbd6 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -467,7 +467,7 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, continue; link_sta->pub->bandwidth = new_sta_bw; - rate_control_rate_update(local, sband, sta, link_id, + rate_control_rate_update(local, sband, link_sta, IEEE80211_RC_BW_CHANGED); } } diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index fe868b521622..84d048339113 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -181,9 +181,10 @@ int drv_sta_set_txpwr(struct ieee80211_local *local, return ret; } -void drv_sta_rc_update(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u32 changed) +void drv_link_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + u32 changed) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) @@ -193,10 +194,10 @@ void drv_sta_rc_update(struct ieee80211_local *local, (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); - trace_drv_sta_rc_update(local, sdata, sta, changed); - if (local->ops->sta_rc_update) - local->ops->sta_rc_update(&local->hw, &sdata->vif, - sta, changed); + trace_drv_link_sta_rc_update(local, sdata, link_sta, changed); + if (local->ops->link_sta_rc_update) + local->ops->link_sta_rc_update(&local->hw, &sdata->vif, + link_sta, changed); trace_drv_return_void(local); } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 48bc2da728c0..edd1e4d4ad9d 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -594,9 +594,9 @@ int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); -void drv_sta_rc_update(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u32 changed); +void drv_link_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 08fac295ad5b..a1b4178deccf 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1072,7 +1072,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, if (sta->sta.deflink.rx_nss != rx_nss) changed |= IEEE80211_RC_NSS_CHANGED; - drv_sta_rc_update(local, sdata, &sta->sta, changed); + drv_link_sta_rc_update(local, sdata, &sta->sta.deflink, + changed); } rcu_read_unlock(); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index be0700d64549..6ea35c88dc48 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -486,10 +486,11 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, sta->sta.deflink.bandwidth = IEEE80211_STA_RX_BW_20; } + /* FIXME: this check is wrong without SW rate control */ if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) rate_control_rate_init(&sta->deflink); else - rate_control_rate_update(local, sband, sta, 0, changed); + rate_control_rate_update(local, sband, &sta->deflink, changed); out: spin_unlock_bh(&sta->mesh->plink_lock); } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 23b4f1af37e0..63fe58311a77 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -113,16 +113,15 @@ void rate_control_tx_status(struct ieee80211_local *local, void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, - struct sta_info *sta, unsigned int link_id, + struct link_sta_info *link_sta, u32 changed) { struct rate_control_ref *ref = local->rate_ctrl; + struct sta_info *sta = link_sta->sta; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_chanctx_conf *chanctx_conf; - WARN_ON(link_id != 0); - if (ref && ref->ops->rate_update) { rcu_read_lock(); @@ -140,7 +139,8 @@ void rate_control_rate_update(struct ieee80211_local *local, } if (sta->uploaded) - drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); + drv_link_sta_rc_update(local, sta->sdata, link_sta->pub, + changed); } int ieee80211_rate_control_register(const struct rate_control_ops *ops) diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 8d3c8903b4ae..673aa9efe30b 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -33,8 +33,7 @@ void rate_control_rate_init(struct link_sta_info *link_sta); void rate_control_rate_init_all_links(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, - struct sta_info *sta, - unsigned int link_id, + struct link_sta_info *link_sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 694b43091fec..d7b294221d43 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3568,7 +3568,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) sband = rx->local->hw.wiphy->bands[status->band]; - rate_control_rate_update(local, sband, rx->sta, 0, + rate_control_rate_update(local, sband, rx->link_sta, IEEE80211_RC_SMPS_CHANGED); cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, @@ -3605,7 +3605,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) ieee80211_sta_rx_bw_to_chan_width(rx->link_sta); sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; - rate_control_rate_update(local, sband, rx->sta, 0, + rate_control_rate_update(local, sband, rx->link_sta, IEEE80211_RC_BW_CHANGED); cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f07b40916485..2f92e7c7f203 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1342,7 +1342,8 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata, bw = min(bw, ieee80211_sta_cap_rx_bw(&sta->deflink)); if (bw != sta->sta.deflink.bandwidth) { sta->sta.deflink.bandwidth = bw; - rate_control_rate_update(local, sband, sta, 0, + rate_control_rate_update(local, sband, + &sta->deflink, IEEE80211_RC_BW_CHANGED); /* * if a TDLS peer BW was updated, we need to diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index e6f0ce8e5d43..7a4985fc2b16 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -939,31 +939,34 @@ TRACE_EVENT(drv_sta_set_txpwr, ) ); -TRACE_EVENT(drv_sta_rc_update, +TRACE_EVENT(drv_link_sta_rc_update, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed), - TP_ARGS(local, sdata, sta, changed), + TP_ARGS(local, sdata, link_sta, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, changed) + __field(u32, link_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - STA_ASSIGN; + STA_NAMED_ASSIGN(link_sta->sta); __entry->changed = changed; + __entry->link_id = link_sta->link_id; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " changed: 0x%x", - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->changed + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " (link %d) changed: 0x%x", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id, + __entry->changed ) ); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index eafe47bf201a..0d439cefb445 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -766,8 +766,7 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, if (changed > 0) { ieee80211_recalc_min_chandef(sdata, link_sta->link_id); - rate_control_rate_update(local, sband, link_sta->sta, - link_sta->link_id, changed); + rate_control_rate_update(local, sband, link_sta, changed); } } -- cgit v1.2.3 From 751e7489c1d74b94ffffbed619d8fd724eeff4ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:58 +0300 Subject: wifi: mac80211: expose ieee80211_chan_width_to_rx_bw() to drivers Drivers might need to also do this calculation, no point in them duplicating the code. Since it's so simple, just make it an inline. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.af003cb4a088.I8b5d29504b726caae24af6013c65b3daebe842a2@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 27 +++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 -- net/mac80211/vht.c | 22 ---------------------- 3 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7a6edc04e212..32bdda90a4ac 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7704,6 +7704,33 @@ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, */ void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); +/** + * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth + * @width: the channel width value to convert + * Return: the STA RX bandwidth value for the channel width + */ +static inline enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) +{ + switch (width) { + default: + WARN_ON_ONCE(1); + fallthrough; + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return IEEE80211_STA_RX_BW_20; + case NL80211_CHAN_WIDTH_40: + return IEEE80211_STA_RX_BW_40; + case NL80211_CHAN_WIDTH_80: + return IEEE80211_STA_RX_BW_80; + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_80P80: + return IEEE80211_STA_RX_BW_160; + case NL80211_CHAN_WIDTH_320: + return IEEE80211_STA_RX_BW_320; + } +} + /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 45987add530f..d20c2e796703 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2194,8 +2194,6 @@ ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta) return _ieee80211_sta_cur_vht_bw(link_sta, NULL); } void ieee80211_sta_init_nss(struct link_sta_info *link_sta); -enum ieee80211_sta_rx_bandwidth -ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 0d439cefb445..6a20fa099190 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -479,28 +479,6 @@ ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *link_sta) } } -enum ieee80211_sta_rx_bandwidth -ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_20_NOHT: - case NL80211_CHAN_WIDTH_20: - return IEEE80211_STA_RX_BW_20; - case NL80211_CHAN_WIDTH_40: - return IEEE80211_STA_RX_BW_40; - case NL80211_CHAN_WIDTH_80: - return IEEE80211_STA_RX_BW_80; - case NL80211_CHAN_WIDTH_160: - case NL80211_CHAN_WIDTH_80P80: - return IEEE80211_STA_RX_BW_160; - case NL80211_CHAN_WIDTH_320: - return IEEE80211_STA_RX_BW_320; - default: - WARN_ON_ONCE(1); - return IEEE80211_STA_RX_BW_20; - } -} - /* FIXME: rename/move - this deals with everything not just VHT */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, -- cgit v1.2.3 From 3607798ad9bdef35ad08489a8239390fccaac6b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:42 +0200 Subject: wifi: cfg80211: add option for vif allowed radios This allows users to prevent a vif from affecting radios other than the configured ones. This can be useful in cases where e.g. an AP is running on one radio, and triggering a scan on another radio should not disturb it. Changing the allowed radios list for a vif is supported, but only while it is down. While it is possible to achieve the same by always explicitly specifying a frequency list for scan requests and ensuring that the wrong channel/band is never accidentally set on an unrelated interface, this change makes multi-radio wiphy setups a lot easier to deal with for CLI users. By itself, this patch only enforces the radio mask for scanning requests and remain-on-channel. Follow-up changes build on this to limit configured frequencies. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/eefcb218780f71a1549875d149f1196486762756.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 +++++++++++ include/uapi/linux/nl80211.h | 5 ++++ net/wireless/core.c | 2 ++ net/wireless/nl80211.c | 60 ++++++++++++++++++++++++++++++++++++++------ net/wireless/scan.c | 10 +++++--- net/wireless/util.c | 29 +++++++++++++++++++++ 6 files changed, 109 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c8ce5c2e14f4..95d05e67e69a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6221,6 +6221,7 @@ enum ieee80211_ap_reg_power { * entered. * @links.cac_time_ms: CAC time in ms * @valid_links: bitmap describing what elements of @links are valid + * @radio_mask: Bitmask of radios that this interface is allowed to operate on. */ struct wireless_dev { struct wiphy *wiphy; @@ -6333,6 +6334,8 @@ struct wireless_dev { unsigned int cac_time_ms; } links[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links; + + u32 radio_mask; }; static inline const u8 *wdev_address(struct wireless_dev *wdev) @@ -6518,6 +6521,17 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef); +/** + * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel + * + * @wdev: the wireless device + * @chan: channel to check + * + * Return: whether or not the wdev may use the channel + */ +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan); + /** * ieee80211_get_response_rate - get basic rate for a given rate * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f97f5adc8d51..d31ccee99cc7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2868,6 +2868,9 @@ enum nl80211_commands { * nested item, it contains attributes defined in * &enum nl80211_if_combination_attrs. * + * @NL80211_ATTR_VIF_RADIO_MASK: Bitmask of allowed radios (u32). + * A value of 0 means all radios. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3416,6 +3419,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_RADIOS, NL80211_ATTR_WIPHY_INTERFACE_COMBINATIONS, + NL80211_ATTR_VIF_RADIO_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index 4c8d8f167409..93d62a1d3a45 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1430,6 +1430,8 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; + wdev->radio_mask = BIT(wdev->wiphy->n_radio) - 1; + if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fb35c03af34c..a330347dd7a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -829,6 +829,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG }, + [NL80211_ATTR_VIF_RADIO_MASK] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -3996,7 +3997,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ (cfg80211_rdev_list_generation << 2)) || - nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) + nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr) || + nla_put_u32(msg, NL80211_ATTR_VIF_RADIO_MASK, wdev->radio_mask)) goto nla_put_failure; if (rdev->ops->get_channel && !wdev->valid_links) { @@ -4312,6 +4314,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, return -EOPNOTSUPP; } +static int nl80211_parse_vif_radio_mask(struct genl_info *info, + u32 *radio_mask) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct nlattr *attr = info->attrs[NL80211_ATTR_VIF_RADIO_MASK]; + u32 mask, allowed; + + if (!attr) { + *radio_mask = 0; + return 0; + } + + allowed = BIT(rdev->wiphy.n_radio) - 1; + mask = nla_get_u32(attr); + if (mask & ~allowed) + return -EINVAL; + if (!mask) + mask = allowed; + *radio_mask = mask; + + return 1; +} + static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4319,6 +4344,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) int err; enum nl80211_iftype otype, ntype; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u32 radio_mask = 0; bool change = false; memset(¶ms, 0, sizeof(params)); @@ -4332,8 +4359,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MESH_ID]) { - struct wireless_dev *wdev = dev->ieee80211_ptr; - if (ntype != NL80211_IFTYPE_MESH_POINT) return -EINVAL; if (otype != NL80211_IFTYPE_MESH_POINT) @@ -4364,6 +4389,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (err > 0) change = true; + err = nl80211_parse_vif_radio_mask(info, &radio_mask); + if (err < 0) + return err; + if (err && netif_running(dev)) + return -EBUSY; + if (change) err = cfg80211_change_iface(rdev, dev, ntype, ¶ms); else @@ -4372,11 +4403,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (!err && params.use_4addr != -1) dev->ieee80211_ptr->use_4addr = params.use_4addr; - if (change && !err) { - struct wireless_dev *wdev = dev->ieee80211_ptr; + if (radio_mask) + wdev->radio_mask = radio_mask; + if (change && !err) nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE); - } return err; } @@ -4387,6 +4418,7 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) struct vif_params params; struct wireless_dev *wdev; struct sk_buff *msg; + u32 radio_mask; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; @@ -4424,6 +4456,10 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) if (err < 0) return err; + err = nl80211_parse_vif_radio_mask(info, &radio_mask); + if (err < 0) + return err; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -4465,6 +4501,9 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) break; } + if (radio_mask) + wdev->radio_mask = radio_mask; + if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0, rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) { nlmsg_free(msg); @@ -9156,6 +9195,9 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, lockdep_assert_wiphy(wdev->wiphy); + if (!cfg80211_wdev_channel_allowed(wdev, chan)) + return false; + if (!cfg80211_beaconing_iface_active(wdev)) return true; @@ -9368,7 +9410,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } /* ignore disabled channels */ - if (chan->flags & IEEE80211_CHAN_DISABLED) + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(wdev, chan)) continue; request->channels[i] = chan; @@ -9388,7 +9431,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; - if (chan->flags & IEEE80211_CHAN_DISABLED) + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(wdev, chan)) continue; request->channels[i] = chan; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8ba618f4734f..8e3d46bf4836 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -956,7 +956,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); - if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) continue; for (i = 0; i < rdev_req->n_channels; i++) { @@ -3515,9 +3516,12 @@ int cfg80211_wext_siwscan(struct net_device *dev, continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + struct ieee80211_channel *chan; + /* ignore disabled channels */ - if (wiphy->bands[band]->channels[j].flags & - IEEE80211_CHAN_DISABLED) + chan = &wiphy->bands[band]->channels[j]; + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(creq->wdev, chan)) continue; /* If we have a wireless request structure and the diff --git a/net/wireless/util.c b/net/wireless/util.c index 93a9c32418a6..040d62051eb9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2923,3 +2923,32 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, return true; } EXPORT_SYMBOL(cfg80211_radio_chandef_valid); + +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + struct wiphy *wiphy = wdev->wiphy; + const struct wiphy_radio *radio; + struct cfg80211_chan_def chandef; + u32 radio_mask; + int i; + + radio_mask = wdev->radio_mask; + if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1) + return true; + + cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20); + for (i = 0; i < wiphy->n_radio; i++) { + if (!(radio_mask & BIT(i))) + continue; + + radio = &wiphy->radio[i]; + if (!cfg80211_radio_chandef_valid(radio, &chandef)) + continue; + + return true; + } + + return false; +} +EXPORT_SYMBOL(cfg80211_wdev_channel_allowed); -- cgit v1.2.3 From ebda716ea4da03326ac4d0a71604d18aa8a2e695 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:45 +0200 Subject: wifi: cfg80211: report per wiphy radio antenna mask With multi-radio devices, each radio typically gets a fixed set of antennas. In order to be able to disable specific antennas for some radios, user space needs to know which antenna mask bits are assigned to which radio. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e0a26afa2c88eaa188ec96ec6d17ecac4e827641.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95d05e67e69a..3100733f3e23 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5434,6 +5434,8 @@ struct wiphy_radio_freq_range { * @iface_combinations: Valid interface combinations array, should not * list single interface types. * @n_iface_combinations: number of entries in @iface_combinations array. + * + * @antenna_mask: bitmask of antennas connected to this radio. */ struct wiphy_radio { const struct wiphy_radio_freq_range *freq_range; @@ -5441,6 +5443,8 @@ struct wiphy_radio { const struct ieee80211_iface_combination *iface_combinations; int n_iface_combinations; + + u32 antenna_mask; }; #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d31ccee99cc7..1b8827f920ff 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8036,6 +8036,8 @@ enum nl80211_ap_settings_flags { * @NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION: Supported interface * combination for this radio. Attribute may be present multiple times * and contains attributes defined in &enum nl80211_if_combination_attrs. + * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas + * connected to this radio. * * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute @@ -8046,6 +8048,7 @@ enum nl80211_wiphy_radio_attrs { NL80211_WIPHY_RADIO_ATTR_INDEX, NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE, NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION, + NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, /* keep last */ __NL80211_WIPHY_RADIO_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a330347dd7a3..aa78f18dd454 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2431,6 +2431,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx)) goto nla_put_failure; + if (r->antenna_mask && + nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, + r->antenna_mask)) + goto nla_put_failure; + for (i = 0; i < r->n_freq_range; i++) { const struct wiphy_radio_freq_range *range = &r->freq_range[i]; -- cgit v1.2.3 From 006a97ceb6732c861c0e2fa3b6a34512caac9354 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:46 +0200 Subject: wifi: mac80211: remove status->ampdu_delimiter_crc This was never used by any driver, so remove it to free up some space. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e6fee6eed49b105261830db1c74f13841fb9616c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +----- net/mac80211/rx.c | 7 +------ 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 32bdda90a4ac..d4f23224eff9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1463,8 +1463,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe - * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC - * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without @@ -1536,7 +1534,7 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), - RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), + /* one free bit at 15 */ RX_FLAG_MACTIME = BIT(16) | BIT(17), RX_FLAG_MACTIME_PLCP_START = 1 << 16, RX_FLAG_MACTIME_START = 2 << 16, @@ -1633,7 +1631,6 @@ enum mac80211_rx_encoding { * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU - * @ampdu_delimiter_crc: A-MPDU delimiter CRC * @zero_length_psdu_type: radiotap type of the 0-length PSDU * @link_valid: if the link which is identified by @link_id is valid. This flag * is set only when connection is MLO. @@ -1671,7 +1668,6 @@ struct ieee80211_rx_status { s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; - u8 ampdu_delimiter_crc; u8 zero_length_psdu_type; u8 link_valid:1, link_id:4; }; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d7b294221d43..a34523bbd156 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -508,18 +508,13 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, flags |= IEEE80211_RADIOTAP_AMPDU_IS_LAST; if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_ERROR) flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR; - if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) - flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN; if (status->flag & RX_FLAG_AMPDU_EOF_BIT_KNOWN) flags |= IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN; if (status->flag & RX_FLAG_AMPDU_EOF_BIT) flags |= IEEE80211_RADIOTAP_AMPDU_EOF; put_unaligned_le16(flags, pos); pos += 2; - if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) - *pos++ = status->ampdu_delimiter_crc; - else - *pos++ = 0; + *pos++ = 0; *pos++ = 0; } -- cgit v1.2.3 From 9c4f830927750a2bf9fd9426a5257f0fdce3b662 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:47 +0200 Subject: wifi: cfg80211: pass net_device to .set_monitor_channel Preparation for allowing multiple monitor interfaces with different channels on a multi-radio wiphy. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/35fa652dbfebf93343f8b9a08fdef0467a2a02dc.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/cfg80211.c | 1 + drivers/net/wireless/marvell/libertas/cfg.c | 1 + drivers/net/wireless/microchip/wilc1000/cfg80211.c | 3 ++- include/net/cfg80211.h | 1 + net/mac80211/cfg.c | 1 + net/wireless/chan.c | 3 ++- net/wireless/core.h | 1 + net/wireless/nl80211.c | 2 +- net/wireless/rdev-ops.h | 5 +++-- net/wireless/trace.h | 10 ++++++---- net/wireless/wext-compat.c | 2 +- 11 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index e8f1d30a8d73..a1a0a9223e74 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1493,6 +1493,7 @@ out: } static int wil_cfg80211_set_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct wil6210_priv *wil = wiphy_to_wil(wiphy); diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index afe9bcd3ad46..2e2c193716d9 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -486,6 +486,7 @@ static int lbs_add_wps_enrollee_tlv(u8 *tlv, const u8 *ie, size_t ie_len) */ static int lbs_cfg_set_monitor_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct lbs_private *priv = wiphy_priv(wiphy); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index b0dae6f7c633..e96736cc7259 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -231,6 +231,7 @@ struct wilc_vif *wilc_get_wl_to_vif(struct wilc *wl) } static int set_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct wilc *wl = wiphy_priv(wiphy); @@ -1424,7 +1425,7 @@ static int start_ap(struct wiphy *wiphy, struct net_device *dev, struct wilc_vif *vif = netdev_priv(dev); int ret; - ret = set_channel(wiphy, &settings->chandef); + ret = set_channel(wiphy, dev, &settings->chandef); if (ret != 0) netdev_err(dev, "Error in setting channel\n"); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3100733f3e23..5feb93ba0400 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4694,6 +4694,7 @@ struct cfg80211_ops { struct ieee80211_channel *chan); int (*set_monitor_channel)(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef); int (*scan)(struct wiphy *wiphy, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ecc138869b4b..492349d6f7bb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -897,6 +897,7 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_set_monitor_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); diff --git a/net/wireless/chan.c b/net/wireless/chan.c index afd86f7c66ce..40b6375a5de4 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -1628,6 +1628,7 @@ bool cfg80211_reg_check_beaconing(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_reg_check_beaconing); int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef) { if (!rdev->ops->set_monitor_channel) @@ -1635,7 +1636,7 @@ int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, if (!cfg80211_has_monitors_only(rdev)) return -EBUSY; - return rdev_set_monitor_channel(rdev, chandef); + return rdev_set_monitor_channel(rdev, dev, chandef); } bool cfg80211_any_usable_channels(struct wiphy *wiphy, diff --git a/net/wireless/core.h b/net/wireless/core.h index 3b3e3cd7027a..4c45f994a8c0 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -516,6 +516,7 @@ static inline unsigned int elapsed_jiffies_msecs(unsigned long start) } int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index aa78f18dd454..84015f56e93a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3567,7 +3567,7 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_MESH_POINT: return cfg80211_set_mesh_channel(rdev, wdev, &chandef); case NL80211_IFTYPE_MONITOR: - return cfg80211_set_monitor_channel(rdev, &chandef); + return cfg80211_set_monitor_channel(rdev, dev, &chandef); default: break; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index f5adbf6b5c84..adb6105bbb7d 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -445,11 +445,12 @@ rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; - trace_rdev_set_monitor_channel(&rdev->wiphy, chandef); - ret = rdev->ops->set_monitor_channel(&rdev->wiphy, chandef); + trace_rdev_set_monitor_channel(&rdev->wiphy, dev, chandef); + ret = rdev->ops->set_monitor_channel(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 97c21b627791..d5c9bb614fa6 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1318,19 +1318,21 @@ TRACE_EVENT(rdev_libertas_set_mesh_channel, ); TRACE_EVENT(rdev_set_monitor_channel, - TP_PROTO(struct wiphy *wiphy, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), + TP_ARGS(wiphy, netdev, chandef), TP_STRUCT__entry( WIPHY_ENTRY + NETDEV_ENTRY CHAN_DEF_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; + NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); TRACE_EVENT(rdev_auth, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 0c8d3797a02e..90d5c0592667 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -821,7 +821,7 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, ret = -EINVAL; break; } - ret = cfg80211_set_monitor_channel(rdev, &chandef); + ret = cfg80211_set_monitor_channel(rdev, dev, &chandef); break; case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); -- cgit v1.2.3 From 9d40f7e32774279be7e5a7a278d7a290872b2f81 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:48 +0200 Subject: wifi: mac80211: add flag to opt out of virtual monitor support This is useful for multi-radio devices that are capable of monitoring on multiple channels simultanenously. When this flag is set, each monitor interface is passed to the driver individually and can have a configured channel. The vif mac address for non-active monitor interfaces is cleared, in order to allow the driver to tell them apart from active ones. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/3c55505ee0cf0a5f141fbcb30d1e8be8d9f40373.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/cfg.c | 44 ++++++++++++++++++++++++++++---------------- net/mac80211/chan.c | 14 +++++++++++++- net/mac80211/debugfs.c | 1 + net/mac80211/driver-ops.c | 1 + net/mac80211/iface.c | 22 +++++++++++++++++----- net/mac80211/rx.c | 3 +++ net/mac80211/tx.c | 6 ++++-- net/mac80211/util.c | 14 ++++++++++---- 9 files changed, 83 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d4f23224eff9..b4f246cdcca4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2694,6 +2694,11 @@ struct ieee80211_txq { * a virtual monitor interface when monitor interfaces are the only * active interfaces. * + * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed + * of any monitor interface, as well as their configured channel. + * This is useful for supporting multiple monitor interfaces on different + * channels. + * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). @@ -2853,6 +2858,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, + IEEE80211_HW_NO_VIRTUAL_MONITOR, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 492349d6f7bb..6c0b228523cb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -105,8 +105,11 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, } /* also validate MU-MIMO change */ - monitor_sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + monitor_sdata = sdata; + else + monitor_sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (!monitor_sdata && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) @@ -114,7 +117,9 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* apply all changes now - no failures allowed */ - if (monitor_sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + if (monitor_sdata && + (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { @@ -907,22 +912,25 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, lockdep_assert_wiphy(local->hw.wiphy); - if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, - &chanreq.oper)) - return 0; + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { + if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, + &chanreq.oper)) + return 0; - sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); - if (!sdata) - goto done; + sdata = wiphy_dereference(wiphy, local->monitor_sdata); + if (!sdata) + goto done; + } - if (cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, + if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) && + cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, &chanreq.oper)) return 0; ieee80211_link_release_channel(&sdata->deflink); ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq, - IEEE80211_CHANCTX_EXCLUSIVE); + IEEE80211_CHANCTX_SHARED); if (ret) return ret; done: @@ -3084,7 +3092,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return -EOPNOTSUPP; @@ -3118,7 +3127,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, local->user_power_level = user_power_level; list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { has_monitor = true; continue; } @@ -3139,7 +3149,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, } } list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; for (int link_id = 0; @@ -4342,7 +4353,8 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy, if (chanctx_conf) { *chandef = link->conf->chanreq.oper; ret = 0; - } else if (local->open_count > 0 && + } else if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && + local->open_count > 0 && local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { *chandef = local->monitor_chanreq.oper; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 996965005d49..a442cb667520 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -347,6 +347,10 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: continue; + case NL80211_IFTYPE_MONITOR: + WARN_ON_ONCE(!ieee80211_hw_check(&local->hw, + NO_VIRTUAL_MONITOR)); + fallthrough; case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: @@ -355,7 +359,6 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: - case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: WARN_ON_ONCE(1); @@ -964,6 +967,10 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, if (!link->sdata->u.mgd.associated) continue; break; + case NL80211_IFTYPE_MONITOR: + if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; + break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: @@ -976,6 +983,11 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) continue; + if (link->sdata->vif.type == NL80211_IFTYPE_MONITOR) { + rx_chains_dynamic = rx_chains_static = local->rx_chains; + break; + } + switch (link->smps_mode) { default: WARN_ONCE(1, "Invalid SMPS mode %d\n", diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 02b5476a4376..b777240c924e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -456,6 +456,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_DYNAMIC_PS), FLAG(MFP_CAPABLE), FLAG(WANT_MONITOR_VIF), + FLAG(NO_VIRTUAL_MONITOR), FLAG(NO_AUTO_VIF), FLAG(SW_CRYPTO_CONTROL), FLAG(SUPPORT_FAST_XMIT), diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 84d048339113..299d38e9e863 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -65,6 +65,7 @@ int drv_add_interface(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7a99fa057cd9..683cc50cc266 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -279,8 +279,13 @@ static int _ieee80211_change_mac(struct ieee80211_sub_if_data *sdata, ret = eth_mac_addr(sdata->dev, sa); if (ret == 0) { - memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); - ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); + if (check_dup) { + memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); + ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); + } else { + memset(sdata->vif.addr, 0, ETH_ALEN); + memset(sdata->vif.bss_conf.addr, 0, ETH_ALEN); + } } /* Regardless of eth_mac_addr() return we still want to add the @@ -699,9 +704,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_recalc_idle(local); ieee80211_recalc_offload(local); - if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) break; + ieee80211_link_release_channel(&sdata->deflink); fallthrough; default: if (!going_down) @@ -1131,7 +1138,8 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); - if (local->monitor_sdata) + if (local->monitor_sdata || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) return 0; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); @@ -1193,6 +1201,9 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + return; + ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1328,7 +1339,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) break; } - if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { res = drv_add_interface(local, sdata); if (res) goto err_stop; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a34523bbd156..d032bfb00ade 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -840,6 +840,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, bool last_monitor = list_is_last(&sdata->u.mntr.list, &local->mon_list); + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + ieee80211_handle_mu_mimo_mon(sdata, origskb, rtap_space); + if (!monskb) monskb = ieee80211_make_monitor_skb(local, &origskb, rate, rtap_space, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f10379bef9fb..a24636bda679 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1763,7 +1763,8 @@ static bool __ieee80211_tx(struct ieee80211_local *local, switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &sdata->vif; break; } @@ -3952,7 +3953,8 @@ begin: switch (tx.sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + if ((tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &tx.sdata->vif; break; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bd93de637f94..a4e1301cc999 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -756,7 +756,8 @@ static void __iterate_interfaces(struct ieee80211_local *local, lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: @@ -1873,8 +1874,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) } list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - sdata->vif.type != NL80211_IFTYPE_MONITOR && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) @@ -1887,11 +1890,14 @@ int ieee80211_reconfig(struct ieee80211_local *local) */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, - list) + list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - sdata->vif.type != NL80211_IFTYPE_MONITOR && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); + } ieee80211_handle_reconfig_failure(local); return res; } -- cgit v1.2.3 From a77e527b470cc38754c730bce1483711f643bb60 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:49 +0200 Subject: wifi: cfg80211: add monitor SKIP_TX flag This can be used to indicate that the user is not interested in receiving locally sent packets on the monitor interface. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/f0c20f832eadd36c71fba9a2a16ba57d78389b6c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 1 + 3 files changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5feb93ba0400..8f9853b1a5d1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2267,6 +2267,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering * @MONITOR_FLAG_COOK_FRAMES: report frames after processing * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address + * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ enum monitor_flags { MONITOR_FLAG_CHANGED = BIT(__NL80211_MNTR_FLAG_INVALID), @@ -2276,6 +2277,7 @@ enum monitor_flags { MONITOR_FLAG_OTHER_BSS = BIT(NL80211_MNTR_FLAG_OTHER_BSS), MONITOR_FLAG_COOK_FRAMES = BIT(NL80211_MNTR_FLAG_COOK_FRAMES), MONITOR_FLAG_ACTIVE = BIT(NL80211_MNTR_FLAG_ACTIVE), + MONITOR_FLAG_SKIP_TX = BIT(NL80211_MNTR_FLAG_SKIP_TX), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1b8827f920ff..6d11437596b9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4703,6 +4703,7 @@ enum nl80211_survey_info { * overrides all other flags. * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address * and ACK incoming unicast packets. + * @NL80211_MNTR_FLAG_SKIP_TX: do not pass local tx packets * * @__NL80211_MNTR_FLAG_AFTER_LAST: internal use * @NL80211_MNTR_FLAG_MAX: highest possible monitor flag @@ -4715,6 +4716,7 @@ enum nl80211_mntr_flags { NL80211_MNTR_FLAG_OTHER_BSS, NL80211_MNTR_FLAG_COOK_FRAMES, NL80211_MNTR_FLAG_ACTIVE, + NL80211_MNTR_FLAG_SKIP_TX, /* keep last */ __NL80211_MNTR_FLAG_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84015f56e93a..4a8c3b6d49d1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4206,6 +4206,7 @@ static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = { [NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG }, [NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG }, [NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_SKIP_TX] = { .type = NLA_FLAG }, }; static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) -- cgit v1.2.3 From 68ed5c38b512b734caf3da1f87db4a99fcfe3002 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:33 -0700 Subject: phonet: Pass net and ifindex to phonet_address_notify(). Currently, phonet_address_notify() fetches netns and ifindex from dev. Once addr_doit() is converted to RCU, phonet_address_notify() will be called outside of RCU due to GFP_KERNEL, and dev will be unavailable there. Let's pass net and ifindex to phonet_address_notify(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 2 +- net/phonet/pn_dev.c | 10 +++++++--- net/phonet/pn_netlink.c | 12 ++++++------ 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index e9dc8dca5817..6b2102b4ece3 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -38,7 +38,7 @@ int phonet_address_add(struct net_device *dev, u8 addr); int phonet_address_del(struct net_device *dev, u8 addr); u8 phonet_address_get(struct net_device *dev, u8 addr); int phonet_address_lookup(struct net *net, u8 addr); -void phonet_address_notify(int event, struct net_device *dev, u8 addr); +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index cde671d29d5d..2e7d850dc726 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -98,10 +98,13 @@ static void phonet_device_destroy(struct net_device *dev) mutex_unlock(&pndevs->lock); if (pnd) { + struct net *net = dev_net(dev); + u32 ifindex = dev->ifindex; u8 addr; for_each_set_bit(addr, pnd->addrs, 64) - phonet_address_notify(RTM_DELADDR, dev, addr); + phonet_address_notify(net, RTM_DELADDR, ifindex, addr); + kfree(pnd); } } @@ -244,8 +247,9 @@ static int phonet_device_autoconf(struct net_device *dev) ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device); if (ret) return ret; - phonet_address_notify(RTM_NEWADDR, dev, - req.ifr_phonet_autoconf.device); + + phonet_address_notify(dev_net(dev), RTM_NEWADDR, dev->ifindex, + req.ifr_phonet_autoconf.device); return 0; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 3205d2457477..23097085ad38 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -22,7 +22,7 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, u32 portid, u32 seq, int event); -void phonet_address_notify(int event, struct net_device *dev, u8 addr) +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr) { struct sk_buff *skb; int err = -ENOBUFS; @@ -32,17 +32,17 @@ void phonet_address_notify(int event, struct net_device *dev, u8 addr) if (skb == NULL) goto errout; - err = fill_addr(skb, dev->ifindex, addr, 0, 0, event); + err = fill_addr(skb, ifindex, addr, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + + rtnl_notify(skb, net, 0, RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); return; errout: - rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_PHONET_IFADDR, err); } static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { @@ -89,7 +89,7 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, else err = phonet_address_del(dev, pnaddr); if (!err) - phonet_address_notify(nlh->nlmsg_type, dev, pnaddr); + phonet_address_notify(net, nlh->nlmsg_type, ifm->ifa_index, pnaddr); return err; } -- cgit v1.2.3 From 42f5fe1dc4babad1c49bcc4121983fffccee3cd9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:34 -0700 Subject: phonet: Convert phonet_device_list.lock to spinlock_t. addr_doit() calls phonet_address_add() or phonet_address_del() for RTM_NEWADDR or RTM_DELADDR, respectively. Both functions only touch phonet_device_list(dev_net(dev)), which is currently protected by RTNL and its dedicated mutex, phonet_device_list.lock. We will convert addr_doit() to RCU and cannot use mutex inside RCU. Let's convert the mutex to spinlock_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 3 ++- net/phonet/pn_dev.c | 26 +++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 6b2102b4ece3..ac0331d83a81 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -12,12 +12,13 @@ #include #include +#include struct net; struct phonet_device_list { struct list_head list; - struct mutex lock; + spinlock_t lock; }; struct phonet_device_list *phonet_device_list(struct net *net); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 2e7d850dc726..545279ef5910 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -54,7 +54,7 @@ static struct phonet_device *__phonet_device_alloc(struct net_device *dev) pnd->netdev = dev; bitmap_zero(pnd->addrs, 64); - BUG_ON(!mutex_is_locked(&pndevs->lock)); + lockdep_assert_held(&pndevs->lock); list_add_rcu(&pnd->list, &pndevs->list); return pnd; } @@ -64,7 +64,8 @@ static struct phonet_device *__phonet_get(struct net_device *dev) struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; - BUG_ON(!mutex_is_locked(&pndevs->lock)); + lockdep_assert_held(&pndevs->lock); + list_for_each_entry(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; @@ -91,11 +92,13 @@ static void phonet_device_destroy(struct net_device *dev) ASSERT_RTNL(); - mutex_lock(&pndevs->lock); + spin_lock(&pndevs->lock); + pnd = __phonet_get(dev); if (pnd) list_del_rcu(&pnd->list); - mutex_unlock(&pndevs->lock); + + spin_unlock(&pndevs->lock); if (pnd) { struct net *net = dev_net(dev); @@ -136,7 +139,8 @@ int phonet_address_add(struct net_device *dev, u8 addr) struct phonet_device *pnd; int err = 0; - mutex_lock(&pndevs->lock); + spin_lock(&pndevs->lock); + /* Find or create Phonet-specific device data */ pnd = __phonet_get(dev); if (pnd == NULL) @@ -145,7 +149,9 @@ int phonet_address_add(struct net_device *dev, u8 addr) err = -ENOMEM; else if (test_and_set_bit(addr >> 2, pnd->addrs)) err = -EEXIST; - mutex_unlock(&pndevs->lock); + + spin_unlock(&pndevs->lock); + return err; } @@ -155,7 +161,8 @@ int phonet_address_del(struct net_device *dev, u8 addr) struct phonet_device *pnd; int err = 0; - mutex_lock(&pndevs->lock); + spin_lock(&pndevs->lock); + pnd = __phonet_get(dev); if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) { err = -EADDRNOTAVAIL; @@ -164,7 +171,8 @@ int phonet_address_del(struct net_device *dev, u8 addr) list_del_rcu(&pnd->list); else pnd = NULL; - mutex_unlock(&pndevs->lock); + + spin_unlock(&pndevs->lock); if (pnd) kfree_rcu(pnd, rcu); @@ -313,7 +321,7 @@ static int __net_init phonet_init_net(struct net *net) return -ENOMEM; INIT_LIST_HEAD(&pnn->pndevs.list); - mutex_init(&pnn->pndevs.lock); + spin_lock_init(&pnn->pndevs.lock); mutex_init(&pnn->routes.lock); return 0; } -- cgit v1.2.3 From de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:38 -0700 Subject: phonet: Pass net and ifindex to rtm_phonet_notify(). Currently, rtm_phonet_notify() fetches netns and ifindex from dev. Once route_doit() is converted to RCU, rtm_phonet_notify() will be called outside of RCU due to GFP_KERNEL, and dev will be unavailable there. Let's pass net and ifindex to rtm_phonet_notify(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 2 +- net/phonet/pn_dev.c | 10 +++++++--- net/phonet/pn_netlink.c | 16 +++++++++------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index ac0331d83a81..021e524fd20a 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -43,7 +43,7 @@ void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst); struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr); struct net_device *phonet_route_output(struct net *net, u8 daddr); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 545279ef5910..6ded0d347b9f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -263,9 +263,13 @@ static int phonet_device_autoconf(struct net_device *dev) static void phonet_route_autodel(struct net_device *dev) { - struct phonet_net *pnn = phonet_pernet(dev_net(dev)); - unsigned int i; + struct net *net = dev_net(dev); DECLARE_BITMAP(deleted, 64); + u32 ifindex = dev->ifindex; + struct phonet_net *pnn; + unsigned int i; + + pnn = phonet_pernet(net); /* Remove left-over Phonet routes */ bitmap_zero(deleted, 64); @@ -281,7 +285,7 @@ static void phonet_route_autodel(struct net_device *dev) return; /* short-circuit RCU */ synchronize_rcu(); for_each_set_bit(i, deleted, 64) { - rtm_phonet_notify(RTM_DELROUTE, dev, i); + rtm_phonet_notify(net, RTM_DELROUTE, ifindex, i); dev_put(dev); } } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index c9a4215ec560..bfec5bd639b6 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -200,7 +200,7 @@ nla_put_failure: return -EMSGSIZE; } -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst) { struct sk_buff *skb; int err = -ENOBUFS; @@ -210,17 +210,17 @@ void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) if (skb == NULL) goto errout; - err = fill_route(skb, dev->ifindex, dst, 0, 0, event); + err = fill_route(skb, ifindex, dst, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); + + rtnl_notify(skb, net, 0, RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); return; errout: - rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_PHONET_ROUTE, err); } static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { @@ -235,6 +235,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *tb[RTA_MAX+1]; struct net_device *dev; struct rtmsg *rtm; + u32 ifindex; int err; u8 dst; @@ -260,7 +261,8 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (dst & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; - dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF])); + ifindex = nla_get_u32(tb[RTA_OIF]); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) return -ENODEV; @@ -269,7 +271,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, else err = phonet_route_del(dev, dst); if (!err) - rtm_phonet_notify(nlh->nlmsg_type, dev, dst); + rtm_phonet_notify(net, nlh->nlmsg_type, ifindex, dst); return err; } -- cgit v1.2.3 From 3deec3b4afb4c767007eae1eeedbcf3da599395b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:39 -0700 Subject: phonet: Convert phonet_routes.lock to spinlock_t. route_doit() calls phonet_route_add() or phonet_route_del() for RTM_NEWROUTE or RTM_DELROUTE, respectively. Both functions only touch phonet_pernet(dev_net(dev))->routes, which is currently protected by RTNL and its dedicated mutex, phonet_routes.lock. We will convert route_doit() to RCU and cannot use mutex inside RCU. Let's convert the mutex to spinlock_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 1 - net/phonet/pn_dev.c | 23 ++++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 021e524fd20a..37a3e83531c6 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -11,7 +11,6 @@ #define PN_DEV_H #include -#include #include struct net; diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 6ded0d347b9f..19234d664c4f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -22,7 +22,7 @@ #include struct phonet_routes { - struct mutex lock; + spinlock_t lock; struct net_device __rcu *table[64]; }; @@ -273,13 +273,15 @@ static void phonet_route_autodel(struct net_device *dev) /* Remove left-over Phonet routes */ bitmap_zero(deleted, 64); - mutex_lock(&pnn->routes.lock); - for (i = 0; i < 64; i++) + + spin_lock(&pnn->routes.lock); + for (i = 0; i < 64; i++) { if (rcu_access_pointer(pnn->routes.table[i]) == dev) { RCU_INIT_POINTER(pnn->routes.table[i], NULL); set_bit(i, deleted); } - mutex_unlock(&pnn->routes.lock); + } + spin_unlock(&pnn->routes.lock); if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ @@ -326,7 +328,7 @@ static int __net_init phonet_init_net(struct net *net) INIT_LIST_HEAD(&pnn->pndevs.list); spin_lock_init(&pnn->pndevs.lock); - mutex_init(&pnn->routes.lock); + spin_lock_init(&pnn->routes.lock); return 0; } @@ -376,13 +378,15 @@ int phonet_route_add(struct net_device *dev, u8 daddr) int err = -EEXIST; daddr = daddr >> 2; - mutex_lock(&routes->lock); + + spin_lock(&routes->lock); if (routes->table[daddr] == NULL) { rcu_assign_pointer(routes->table[daddr], dev); dev_hold(dev); err = 0; } - mutex_unlock(&routes->lock); + spin_unlock(&routes->lock); + return err; } @@ -392,12 +396,13 @@ int phonet_route_del(struct net_device *dev, u8 daddr) struct phonet_routes *routes = &pnn->routes; daddr = daddr >> 2; - mutex_lock(&routes->lock); + + spin_lock(&routes->lock); if (rcu_access_pointer(routes->table[daddr]) == dev) RCU_INIT_POINTER(routes->table[daddr], NULL); else dev = NULL; - mutex_unlock(&routes->lock); + spin_unlock(&routes->lock); if (!dev) return -ENOENT; -- cgit v1.2.3 From 26d8db55eeacb7dc78672523f57825916d203de4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:29 -0700 Subject: rtnetlink: Define RTNL_FLAG_DOIT_PERNET for per-netns RTNL doit(). We will push RTNL down to each doit() as rtnl_net_lock(). We can use RTNL_FLAG_DOIT_UNLOCKED to call doit() without RTNL, but doit() will still hold RTNL. Let's define RTNL_FLAG_DOIT_PERNET as an alias of RTNL_FLAG_DOIT_UNLOCKED. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e0d9a8eae6b6..b260c0cc9671 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -12,6 +12,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), +#define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ -- cgit v1.2.3 From 1ddf9916ac09313128e40d6581cef889c0b4ce84 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:42 +0200 Subject: xfrm: Add support for per cpu xfrm state handling. Currently all flows for a certain SA must be processed by the same cpu to avoid packet reordering and lock contention of the xfrm state lock. To get rid of this limitation, the IETF standardized per cpu SAs in RFC 9611. This patch implements the xfrm part of it. We add the cpu as a lookup key for xfrm states and a config option to generate acquire messages for each cpu. With that, we can have on each cpu a SA with identical traffic selector so that flows can be processed in parallel on all cpus. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 5 ++-- include/uapi/linux/xfrm.h | 2 ++ net/key/af_key.c | 7 +++--- net/xfrm/xfrm_compat.c | 6 +++-- net/xfrm/xfrm_state.c | 58 ++++++++++++++++++++++++++++++++++++++--------- net/xfrm/xfrm_user.c | 56 +++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 112 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a0bdd58f401c..f5275618e744 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -188,6 +188,7 @@ struct xfrm_state { refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -1684,7 +1685,7 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1796,7 +1797,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index f28701500714..d73a97e3030a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -322,6 +322,7 @@ enum xfrm_attr_type_t { XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ XFRMA_SA_DIR, /* __u8 */ XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ + XFRMA_SA_PCPU, /* __u32 */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ @@ -437,6 +438,7 @@ struct xfrm_userpolicy_info { #define XFRM_POLICY_LOCALOK 1 /* Allow user to override global policy */ /* Automatically expand selector to include matching ICMP payloads. */ #define XFRM_POLICY_ICMP 2 +#define XFRM_POLICY_CPU_ACQUIRE 4 __u8 share; }; diff --git a/net/key/af_key.c b/net/key/af_key.c index f79fb99271ed..c56bb4f451e6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1354,7 +1354,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (hdr->sadb_msg_seq) { - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; @@ -1362,7 +1362,8 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (!x) - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, + proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; @@ -1417,7 +1418,7 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x == NULL) return 0; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 91357ccaf4af..5b9ee63e30b6 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -132,6 +132,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = { [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, }; static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, @@ -282,9 +283,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_MTIMER_THRESH: case XFRMA_SA_DIR: case XFRMA_NAT_KEEPALIVE_INTERVAL: + case XFRMA_SA_PCPU: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -439,7 +441,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 37478d36a8df..ebef07b80afa 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -679,6 +679,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) x->lft.hard_packet_limit = XFRM_INF; x->replay_maxage = 0; x->replay_maxdiff = 0; + x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); } return x; @@ -1155,6 +1156,12 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, struct xfrm_state **best, int *acq_in_progress, int *error) { + /* We need the cpu id just as a lookup key, + * we don't require it to be stable. + */ + unsigned int pcpu_id = get_cpu(); + put_cpu(); + /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. @@ -1174,13 +1181,18 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, &fl->u.__fl_common)) return; + if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id) + return; + if (!*best || + ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) || (*best)->km.dying > x->km.dying || ((*best)->km.dying == x->km.dying && (*best)->curlft.add_time < x->curlft.add_time)) *best = x; } else if (x->km.state == XFRM_STATE_ACQ) { - *acq_in_progress = 1; + if (!*best || x->pcpu_num == pcpu_id) + *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if ((!x->sel.family || @@ -1209,6 +1221,13 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; + unsigned int pcpu_id; + + /* We need the cpu id just as a lookup key, + * we don't require it to be stable. + */ + pcpu_id = get_cpu(); + put_cpu(); to_put = NULL; @@ -1282,7 +1301,10 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, } found: - x = best; + if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || + (best && (best->pcpu_num == pcpu_id))) + x = best; + if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup_all(net, mark, daddr, @@ -1314,6 +1336,8 @@ found: xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); x->if_id = if_id; + if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best) + x->pcpu_num = pcpu_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { @@ -1392,6 +1416,11 @@ found: x = NULL; error = -ESRCH; } + + /* Use the already installed 'fallback' while the CPU-specific + * SA acquire is handled*/ + if (best) + x = best; } out: if (x) { @@ -1524,12 +1553,14 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; u32 if_id = xnew->if_id; + u32 cpu_id = xnew->pcpu_num; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && + x->pcpu_num == cpu_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) @@ -1552,7 +1583,7 @@ EXPORT_SYMBOL(xfrm_state_insert); static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, - u32 reqid, u32 if_id, u8 proto, + u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) @@ -1569,6 +1600,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || + x->pcpu_num != pcpu_num || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; @@ -1602,6 +1634,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, break; } + x->pcpu_num = pcpu_num; x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; x->props.family = family; @@ -1630,7 +1663,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, return x; } -static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_add(struct xfrm_state *x) { @@ -1656,7 +1689,7 @@ int xfrm_state_add(struct xfrm_state *x) } if (use_spi && x->km.seq) { - x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); + x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num); if (x1 && ((x1->id.proto != x->id.proto) || !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; @@ -1666,7 +1699,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, - x->props.reqid, x->if_id, x->id.proto, + x->props.reqid, x->if_id, x->pcpu_num, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); @@ -1791,6 +1824,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->pcpu_num = orig->pcpu_num; x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; @@ -2066,13 +2100,14 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, - u32 if_id, u8 proto, const xfrm_address_t *daddr, + u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); + x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num, + proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; @@ -2207,7 +2242,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, /* Silly enough, but I'm lazy to build resolution list */ -static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; @@ -2215,6 +2250,7 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && + x->pcpu_num == pcpu_num && x->km.state == XFRM_STATE_ACQ) { xfrm_state_hold(x); return x; @@ -2224,12 +2260,12 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s return NULL; } -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_find_acq_byseq(net, mark, seq); + x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e3b8ce89831a..e4d448950d05 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -460,6 +460,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, } } + if (!sa_dir && attrs[XFRMA_SA_PCPU]) { + NL_SET_ERR_MSG(extack, "SA_PCPU only supported with SA_DIR"); + err = -EINVAL; + goto out; + } + out: return err; } @@ -841,6 +847,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, x->nat_keepalive_interval = nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]); + if (attrs[XFRMA_SA_PCPU]) { + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + if (x->pcpu_num >= num_possible_cpus()) + goto error; + } + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; @@ -1296,6 +1308,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } + if (x->pcpu_num != UINT_MAX) { + ret = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (ret) + goto out; + } if (x->dir) ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -1700,6 +1717,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, u32 mark; struct xfrm_mark m; u32 if_id = 0; + u32 pcpu_num = UINT_MAX; p = nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); @@ -1716,8 +1734,16 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_SA_PCPU]) { + pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + if (pcpu_num >= num_possible_cpus()) { + err = -EINVAL; + goto out_noput; + } + } + if (p->info.seq) { - x = xfrm_find_acq_byseq(net, mark, p->info.seq); + x = xfrm_find_acq_byseq(net, mark, p->info.seq, pcpu_num); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { xfrm_state_put(x); x = NULL; @@ -1726,7 +1752,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, - if_id, p->info.id.proto, daddr, + if_id, pcpu_num, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; @@ -2526,7 +2552,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ + nla_total_size(4) /* XFRM_AE_ETHR */ - + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */ + + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */ + + nla_total_size(4); /* XFRMA_SA_PCPU */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -2582,6 +2609,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) goto out_cancel; + if (x->pcpu_num != UINT_MAX) + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -2852,6 +2881,13 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_mark_get(attrs, &mark); + if (attrs[XFRMA_SA_PCPU]) { + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + err = -EINVAL; + if (x->pcpu_num >= num_possible_cpus()) + goto free_state; + } + err = verify_newpolicy_info(&ua->policy, extack); if (err) goto free_state; @@ -3182,6 +3218,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3348,7 +3385,8 @@ static inline unsigned int xfrm_expire_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + nla_total_size(sizeof(struct xfrm_mark)) + - nla_total_size(sizeof_field(struct xfrm_state, dir)); + nla_total_size(sizeof_field(struct xfrm_state, dir)) + + nla_total_size(4); /* XFRMA_SA_PCPU */ } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -3374,6 +3412,11 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) return err; + if (x->pcpu_num != UINT_MAX) { + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (err) + return err; + } if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -3481,6 +3524,8 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) } if (x->if_id) l += nla_total_size(sizeof(x->if_id)); + if (x->pcpu_num) + l += nla_total_size(sizeof(x->pcpu_num)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@ -3587,6 +3632,7 @@ static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x, + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(xfrm_user_sec_ctx_size(x->security)) + + nla_total_size(4) /* XFRMA_SA_PCPU */ + userpolicy_type_attrsize(); } @@ -3623,6 +3669,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); + if (!err && x->pcpu_num != UINT_MAX) + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); if (err) { nlmsg_cancel(skb, nlh); return err; -- cgit v1.2.3 From 0045e3d80613cc7174dc15f189ee6fc4e73b9365 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:43 +0200 Subject: xfrm: Cache used outbound xfrm states at the policy. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we cache the used xfrm states at the policy for outbound IPsec traffic. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 4 ++++ net/xfrm/xfrm_policy.c | 12 +++++++++++ net/xfrm/xfrm_state.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f5275618e744..0b394c5fb5f3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -184,6 +184,7 @@ struct xfrm_state { }; struct hlist_node byspi; struct hlist_node byseq; + struct hlist_node state_cache; refcount_t refcnt; spinlock_t lock; @@ -537,6 +538,7 @@ struct xfrm_policy_queue { * @xp_net: network namespace the policy lives in * @bydst: hlist node for SPD hash table or rbtree list * @byidx: hlist node for index hash table + * @state_cache_list: hlist head for policy cached xfrm states * @lock: serialize changes to policy structure members * @refcnt: reference count, freed once it reaches 0 * @pos: kernel internal tie-breaker to determine age of policy @@ -567,6 +569,8 @@ struct xfrm_policy { struct hlist_node bydst; struct hlist_node byidx; + struct hlist_head state_cache_list; + /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a2ea9dbac90b..8a1b83191a6c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -434,6 +434,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_HEAD(&policy->state_cache_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -475,6 +476,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + struct net *net = xp_net(policy); + struct xfrm_state *x; + xfrm_dev_policy_delete(policy); write_lock_bh(&policy->lock); @@ -490,6 +494,13 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) if (del_timer(&policy->timer)) xfrm_pol_put(policy); + /* XXX: Flush state cache */ + spin_lock_bh(&net->xfrm.xfrm_state_lock); + hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) { + hlist_del_init_rcu(&x->state_cache); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + xfrm_pol_put(policy); } @@ -3275,6 +3286,7 @@ no_transform: dst_release(dst); dst = dst_orig; } + ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ebef07b80afa..a2047825f6c8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -665,6 +665,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) refcount_set(&x->refcnt, 1); atomic_set(&x->tunnel_users, 0); INIT_LIST_HEAD(&x->km.all); + INIT_HLIST_NODE(&x->state_cache); INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); @@ -744,12 +745,15 @@ int __xfrm_state_delete(struct xfrm_state *x) if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; + spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); if (x->km.seq) hlist_del_rcu(&x->byseq); + if (!hlist_unhashed(&x->state_cache)) + hlist_del_rcu(&x->state_cache); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1222,6 +1226,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned int sequence; struct km_event c; unsigned int pcpu_id; + bool cached = false; /* We need the cpu id just as a lookup key, * we don't require it to be stable. @@ -1234,6 +1239,46 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, encap_family, + &best, &acquire_in_progress, &error); + } + + if (best) + goto cached; + + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, family, + &best, &acquire_in_progress, &error); + } + +cached: + cached = true; + if (best) + goto found; + else if (error) + best = NULL; + else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ + WARN_ON(1); + h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD @@ -1383,6 +1428,7 @@ found: XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); + INIT_HLIST_NODE(&x->state_cache); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, @@ -1431,6 +1477,15 @@ out: } else { *err = acquire_in_progress ? -EAGAIN : error; } + + if (x && x->km.state == XFRM_STATE_VALID && !cached && + (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || x->pcpu_num == pcpu_id)) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + if (hlist_unhashed(&x->state_cache)) + hlist_add_head_rcu(&x->state_cache, &pol->state_cache_list); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); -- cgit v1.2.3 From 81a331a0e72ddc2f75092603d9577bd1a0ca23ad Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:44 +0200 Subject: xfrm: Add an inbound percpu state cache. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we add a percpu cache to cache the used inbound xfrm states. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/netns/xfrm.h | 1 + include/net/xfrm.h | 5 +++++ net/ipv4/esp4_offload.c | 6 ++--- net/ipv6/esp6_offload.c | 6 ++--- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_state.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index ae60d6664095..23dd647fe024 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -43,6 +43,7 @@ struct netns_xfrm { struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; struct hlist_head __rcu *state_byseq; + struct hlist_head __percpu *state_cache_input; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0b394c5fb5f3..2b87999bd5aa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -185,6 +185,7 @@ struct xfrm_state { struct hlist_node byspi; struct hlist_node byseq; struct hlist_node state_cache; + struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; @@ -1650,6 +1651,10 @@ int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 80c4ea0e12f4..e0d94270da28 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -53,9 +53,9 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ip_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 919ebfabbe4e..7b41fb4f00b5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -80,9 +80,9 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ipv6_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET6); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 749e7eea99e4..841a60a6fbfe 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -572,7 +572,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); + x = xfrm_input_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a2047825f6c8..e3266a5d4f90 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -754,6 +754,9 @@ int __xfrm_state_delete(struct xfrm_state *x) hlist_del_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); + if (!hlist_unhashed(&x->state_cache_input)) + hlist_del_rcu(&x->state_cache_input); + if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1106,6 +1109,52 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, return NULL; } +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family) +{ + struct hlist_head *state_cache_input; + struct xfrm_state *x = NULL; + int cpu = get_cpu(); + + state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + + rcu_read_lock(); + hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + !xfrm_addr_equal(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + if (!xfrm_state_hold_rcu(x)) + continue; + goto out; + } + + x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + + if (x && x->km.state == XFRM_STATE_VALID) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + if (hlist_unhashed(&x->state_cache_input)) { + hlist_add_head_rcu(&x->state_cache_input, state_cache_input); + } else { + hlist_del_rcu(&x->state_cache_input); + hlist_add_head_rcu(&x->state_cache_input, state_cache_input); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + +out: + rcu_read_unlock(); + put_cpu(); + return x; +} +EXPORT_SYMBOL(xfrm_input_state_lookup); + static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -3079,6 +3128,11 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_byseq = xfrm_hash_alloc(sz); if (!net->xfrm.state_byseq) goto out_byseq; + + net->xfrm.state_cache_input = alloc_percpu(struct hlist_head); + if (!net->xfrm.state_cache_input) + goto out_state_cache_input; + net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; @@ -3088,6 +3142,8 @@ int __net_init xfrm_state_init(struct net *net) &net->xfrm.xfrm_state_lock); return 0; +out_state_cache_input: + xfrm_hash_free(net->xfrm.state_byseq, sz); out_byseq: xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: @@ -3117,6 +3173,7 @@ void xfrm_state_fini(struct net *net) xfrm_hash_free(net->xfrm.state_bysrc, sz); WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); + free_percpu(net->xfrm.state_cache_input); } #ifdef CONFIG_AUDITSYSCALL -- cgit v1.2.3 From 4bbd360a5084d8f890f814327e1d9fbb1f0f6fa1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 24 Oct 2024 13:14:58 -0700 Subject: socket: Print pf->create() when it does not clear sock->sk on failure. I suggested to put DEBUG_NET_WARN_ON_ONCE() in __sock_create() to catch possible use-after-free. But the warning itself was not useful because our interest is in the callee than the caller. Let's define DEBUG_NET_WARN_ONCE() and print the name of pf->create() and the socket identifier. While at it, we enclose DEBUG_NET_WARN_ON_ONCE() in parentheses too to avoid a checkpatch error. Note that %pf or %pF were obsoleted and will be removed later as per comment in lib/vsprintf.c. Link: https://lore.kernel.org/netdev/202410231427.633734b3-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241024201458.49412-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_debug.h | 4 +++- net/socket.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/net_debug.h b/include/net/net_debug.h index 1e74684cbbdb..9fecb1496be3 100644 --- a/include/net/net_debug.h +++ b/include/net/net_debug.h @@ -149,9 +149,11 @@ do { \ #if defined(CONFIG_DEBUG_NET) -#define DEBUG_NET_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define DEBUG_NET_WARN_ON_ONCE(cond) ((void)WARN_ON_ONCE(cond)) +#define DEBUG_NET_WARN_ONCE(cond, format...) ((void)WARN_ONCE(cond, format)) #else #define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define DEBUG_NET_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #endif /* _LINUX_NET_DEBUG_H */ diff --git a/net/socket.c b/net/socket.c index 9a8e4452b9b2..5fb3d265e492 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1578,7 +1578,9 @@ int __sock_create(struct net *net, int family, int type, int protocol, /* ->create should release the allocated sock->sk object on error * and make sure sock->sk is set to NULL to avoid use-after-free */ - DEBUG_NET_WARN_ON_ONCE(sock->sk); + DEBUG_NET_WARN_ONCE(sock->sk, + "%ps must clear sock->sk on failure, family: %d, type: %d, protocol: %d\n", + pf->create, family, type, protocol); goto out_module_put; } -- cgit v1.2.3 From 2a0df10434ddeb8b5b5aded90a5659aff3b106d7 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:06 +0200 Subject: devlink: remove unused devlink_resource_occ_get_register() and _unregister() Remove not used devlink_resource_occ_get_register() and devlink_resource_occ_get_unregister() functions; current devlink resource users are fine with devl_ variants of the two. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-7-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 7 ------- net/devlink/resource.c | 39 --------------------------------------- 2 files changed, 46 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index db5eff6cb60f..fdd6a0f9891d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1797,15 +1797,8 @@ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv); void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id); - -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id); int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count); diff --git a/net/devlink/resource.c b/net/devlink/resource.c index 96c0ff24b65a..a923222bbde8 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -513,28 +513,6 @@ void devl_resource_occ_get_register(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); -/** - * devlink_resource_occ_get_register - register occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - * - * Context: Takes and release devlink->lock . - */ -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - devl_lock(devlink); - devl_resource_occ_get_register(devlink, resource_id, - occ_get, occ_get_priv); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); - /** * devl_resource_occ_get_unregister - unregister occupancy getter * @@ -557,20 +535,3 @@ void devl_resource_occ_get_unregister(struct devlink *devlink, resource->occ_get_priv = NULL; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); - -/** - * devlink_resource_occ_get_unregister - unregister occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * - * Context: Takes and release devlink->lock . - */ -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ - devl_lock(devlink); - devl_resource_occ_get_unregister(devlink, resource_id); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); -- cgit v1.2.3 From e3302f9a503a632c125170dc3c72b2886a910b0a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:07 +0200 Subject: devlink: remove unused devlink_resource_register() Remove unused devlink_resource_register(); all the drivers use devl_resource_register() variant instead. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-8-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ------ net/devlink/resource.c | 33 --------------------------------- 2 files changed, 39 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index fdd6a0f9891d..fbb9a2668e24 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1779,12 +1779,6 @@ int devl_resource_register(struct devlink *devlink, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params); void devl_resources_unregister(struct devlink *devlink); void devlink_resources_unregister(struct devlink *devlink); int devl_resource_size_get(struct devlink *devlink, diff --git a/net/devlink/resource.c b/net/devlink/resource.c index a923222bbde8..2d6324f3d91f 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -381,39 +381,6 @@ int devl_resource_register(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_resource_register); -/** - * devlink_resource_register - devlink resource register - * - * @devlink: devlink - * @resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent id - * @size_params: size parameters - * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst - * - * Context: Takes and release devlink->lock . - */ -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - int err; - - devl_lock(devlink); - err = devl_resource_register(devlink, resource_name, resource_size, - resource_id, parent_resource_id, size_params); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_resource_register); - static void devlink_resource_unregister(struct devlink *devlink, struct devlink_resource *resource) { -- cgit v1.2.3 From 386c2b877b97cde53c6e422f9081ae133492fd05 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 23 Oct 2024 16:14:51 +0800 Subject: tcp: add a common helper to debug the underlying issue Following the commit c8770db2d544 ("tcp: check skb is non-NULL in tcp_rto_delta_us()"), we decided to add a helper so that it's easier to get verbose warning on either cases. Link: https://lore.kernel.org/all/5632e043-bdba-4d75-bc7e-bf58014492fd@redhat.com/ Suggested-by: Paolo Abeni Signed-off-by: Jason Xing Cc: Neal Cardwell Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 739a9fb83d0c..8b8d94bb1746 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2430,6 +2430,19 @@ void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); +static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) +{ + WARN_ONCE(cond, + "%sout:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + str, + tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, + tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, + tcp_sk(sk)->tlp_high_seq, sk->sk_state, + inet_csk(sk)->icsk_ca_state, + tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, + inet_csk(sk)->icsk_pmtu_cookie); +} + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { @@ -2441,17 +2454,7 @@ static inline s64 tcp_rto_delta_us(const struct sock *sk) return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } else { - WARN_ONCE(1, - "rtx queue empty: " - "out:%u sacked:%u lost:%u retrans:%u " - "tlp_high_seq:%u sk_state:%u ca_state:%u " - "advmss:%u mss_cache:%u pmtu:%u\n", - tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, - tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, - tcp_sk(sk)->tlp_high_seq, sk->sk_state, - inet_csk(sk)->icsk_ca_state, - tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, - inet_csk(sk)->icsk_pmtu_cookie); + tcp_warn_once(sk, 1, "rtx queue empty: "); return jiffies_to_usecs(rto); } -- cgit v1.2.3 From 668d663989c77fcb2a92748645e4c394b03d5988 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 23 Oct 2024 16:14:52 +0800 Subject: tcp: add more warn of socket in tcp_send_loss_probe() Add two fields to print in the helper which here covers tcp_send_loss_probe(). Link: https://lore.kernel.org/all/5632e043-bdba-4d75-bc7e-bf58014492fd@redhat.com/ Suggested-by: Paolo Abeni Signed-off-by: Jason Xing Cc: Neal Cardwell Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp_output.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b8d94bb1746..e9b37b76e894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2433,8 +2433,9 @@ void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) { WARN_ONCE(cond, - "%sout:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", str, + tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, tcp_sk(sk)->tlp_high_seq, sk->sk_state, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 054244ce5117..5485a70b5fe5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2954,9 +2954,7 @@ void tcp_send_loss_probe(struct sock *sk) } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { - WARN_ONCE(tp->packets_out, - "invalid inflight: %u state %u cwnd %u mss %d\n", - tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); + tcp_warn_once(sk, tp->packets_out, "invalid inflight: "); smp_store_release(&inet_csk(sk)->icsk_pending, 0); return; } -- cgit v1.2.3 From db71aae70e3e646d8ba4cb50e4bd4c281a91c804 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Sat, 26 Oct 2024 12:53:36 +0000 Subject: net: checksum: Move from32to16() to generic header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit from32to16() is used by lib/checksum.c and also by arch/parisc/lib/checksum.c. The next patch will use it in the bpf_csum_diff helper. Move from32to16() to the include/net/checksum.h as csum_from32to16() and remove other implementations. Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20241026125339.26459-2-puranjay@kernel.org --- arch/parisc/lib/checksum.c | 13 ++----------- include/net/checksum.h | 6 ++++++ lib/checksum.c | 11 +---------- 3 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/arch/parisc/lib/checksum.c b/arch/parisc/lib/checksum.c index 4818f3db84a5..59d8c15d81bd 100644 --- a/arch/parisc/lib/checksum.c +++ b/arch/parisc/lib/checksum.c @@ -25,15 +25,6 @@ : "=r"(_t) \ : "r"(_r), "0"(_t)); -static inline unsigned short from32to16(unsigned int x) -{ - /* 32 bits --> 16 bits + carry */ - x = (x & 0xffff) + (x >> 16); - /* 16 bits + carry --> 16 bits including carry */ - x = (x & 0xffff) + (x >> 16); - return (unsigned short)x; -} - static inline unsigned int do_csum(const unsigned char * buff, int len) { int odd, count; @@ -85,7 +76,7 @@ static inline unsigned int do_csum(const unsigned char * buff, int len) } if (len & 1) result += le16_to_cpu(*buff); - result = from32to16(result); + result = csum_from32to16(result); if (odd) result = swab16(result); out: @@ -102,7 +93,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum) { unsigned int result = do_csum(buff, len); addc(result, sum); - return (__force __wsum)from32to16(result); + return (__force __wsum)csum_from32to16(result); } EXPORT_SYMBOL(csum_partial); diff --git a/include/net/checksum.h b/include/net/checksum.h index 1338cb92c8e7..243f972267b8 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -151,6 +151,12 @@ static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) *csum = csum_add(csum_sub(*csum, old), new); } +static inline unsigned short csum_from32to16(unsigned int sum) +{ + sum += (sum >> 16) | (sum << 16); + return (unsigned short)(sum >> 16); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); diff --git a/lib/checksum.c b/lib/checksum.c index 6860d6b05a17..025ba546e1ec 100644 --- a/lib/checksum.c +++ b/lib/checksum.c @@ -34,15 +34,6 @@ #include #ifndef do_csum -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - static unsigned int do_csum(const unsigned char *buff, int len) { int odd; @@ -90,7 +81,7 @@ static unsigned int do_csum(const unsigned char *buff, int len) #else result += (*buff << 8); #endif - result = from32to16(result); + result = csum_from32to16(result); if (odd) result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); out: -- cgit v1.2.3 From 2748697225c38a19666bba6a83afc6bf46ee16be Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Oct 2024 16:52:46 +0300 Subject: net: sched: propagate "skip_sw" flag to struct flow_cls_common_offload Background: switchdev ports offload the Linux bridge, and most of the packets they handle will never see the CPU. The ports between which there exists no hardware data path are considered 'foreign' to switchdev. These can either be normal physical NICs without switchdev offload, or incompatible switchdev ports, or virtual interfaces like veth/dummy/etc. In some cases, an offloaded filter can only do half the work, and the rest must be handled by software. Redirecting/mirroring from the ingress of a switchdev port towards a foreign interface is one example of combined hardware/software data path. The most that the switchdev port can do is to extract the matching packets from its offloaded data path and send them to the CPU. From there on, the software filter runs (a second time, after the first run in hardware) on the packet and performs the mirred action. It makes sense for switchdev drivers which allow this kind of "half offloading" to sense the "skip_sw" flag of the filter/action pair, and deny attempts from the user to install a filter that does not run in software, because that simply won't work. In fact, a mirred action on a switchdev port towards a dummy interface appears to be a valid way of (selectively) monitoring offloaded traffic that flows through it. IFF_PROMISC was also discussed years ago, but (despite initial disagreement) there seems to be consensus that this flag should not affect the destination taken by packets, but merely whether or not the NIC discards packets with unknown MAC DA for local processing. [1] https://lore.kernel.org/netdev/20190830092637.7f83d162@ceranb/ [2] https://lore.kernel.org/netdev/20191002233750.13566-1-olteanv@gmail.com/ Suggested-by: Ido Schimmel Link: https://lore.kernel.org/netdev/ZxUo0Dc0M5Y6l9qF@shredder.mtl.com/ Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241023135251.1752488-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/flow_offload.h | 1 + include/net/pkt_cls.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 292cd8f4b762..596ab9791e4d 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -685,6 +685,7 @@ struct flow_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + bool skip_sw; struct netlink_ext_ack *extack; }; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4880b3a7aced..cf199af85c52 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -755,6 +755,7 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio >> 16; + cls_common->skip_sw = tc_skip_sw(flags); if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit v1.2.3 From d86c7a9162ae925dcb6632a4544c62eb0eb5c7cc Mon Sep 17 00:00:00 2001 From: George Guo Date: Mon, 28 Oct 2024 20:34:35 +0800 Subject: netlabel: document doi_remove field of struct netlbl_calipso_ops Add documentation of doi_remove field to Kernel doc for struct netlbl_calipso_ops. Flagged by ./scripts/kernel-doc -none. Signed-off-by: George Guo Acked-by: Paul Moore Link: https://patch.msgid.link/20241028123435.3495916-1-dongtai.guo@linux.dev Signed-off-by: Jakub Kicinski --- include/net/netlabel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 529160f76cac..4afd934b1238 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -208,6 +208,7 @@ struct netlbl_lsm_secattr { * struct netlbl_calipso_ops - NetLabel CALIPSO operations * @doi_add: add a CALIPSO DOI * @doi_free: free a CALIPSO DOI + * @doi_remove: remove a CALIPSO DOI * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI * @doi_walk: enumerate the DOI list -- cgit v1.2.3 From 4138e9ec00936c9fe7d0fe961e32f381b1e36926 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 29 Oct 2024 11:47:14 +0100 Subject: netlink: add NLA_POLICY_MAX_LEN macro Similarly to NLA_POLICY_MIN_LEN, NLA_POLICY_MAX_LEN defines a policy with a maximum length value. The netlink generator for YAML specs has been extended accordingly. Signed-off-by: Antonio Quartulli Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20241029-b4-ovpn-v11-1-de4698c73a25@openvpn.net Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 1 + tools/net/ynl/ynl-gen-c.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index db6af207287c..2dc671c977ff 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -469,6 +469,7 @@ struct nla_policy { .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) +#define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 1a825b4081b2..aa22eb092475 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -481,7 +481,7 @@ class TypeBinary(Type): pass elif len(self.checks) == 1: check_name = list(self.checks)[0] - if check_name not in {'exact-len', 'min-len'}: + if check_name not in {'exact-len', 'min-len', 'max-len'}: raise Exception('Unsupported check for binary type: ' + check_name) else: raise Exception('More than one check for binary type not implemented, yet') @@ -492,6 +492,8 @@ class TypeBinary(Type): mem = 'NLA_POLICY_EXACT_LEN(' + self.get_limit_str('exact-len') + ')' elif 'min-len' in self.checks: mem = '{ .len = ' + self.get_limit_str('min-len') + ', }' + elif 'max-len' in self.checks: + mem = 'NLA_POLICY_MAX_LEN(' + self.get_limit_str('max-len') + ')' return mem -- cgit v1.2.3 From 6b2d11e2d8fc130df4708be0b6b53fd3e6b54cf6 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 30 Oct 2024 04:22:33 +0000 Subject: net/tcp: Add missing lockdep annotations for TCP-AO hlist traversals Under CONFIG_PROVE_RCU_LIST + CONFIG_RCU_EXPERT hlist_for_each_entry_rcu() provides very helpful splats, which help to find possible issues. I missed CONFIG_RCU_EXPERT=y in my testing config the same as described in a3e4bf7f9675 ("configs/debug: make sure PROVE_RCU_LIST=y takes effect"). The fix itself is trivial: add the very same lockdep annotations as were used to dereference ao_info from the socket. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20241028152645.35a8be66@kernel.org/ Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20241030-tcp-ao-hlist-lockdep-annotate-v1-1-bf641a64d7c6@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 3 ++- net/ipv4/tcp_ao.c | 42 +++++++++++++++++++++++------------------- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 4 ++-- 4 files changed, 29 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 1d46460d0fef..df655ce6987d 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -183,7 +183,8 @@ int tcp_ao_hash_skb(unsigned short int family, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); -struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index db6516092daf..bbb8d5f0eae7 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -109,12 +109,13 @@ bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) * it's known that the keys in ao_info are matching peer's * family/address/VRF/etc. */ -struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, int sndid, int rcvid) { struct tcp_ao_key *key; - hlist_for_each_entry_rcu(key, &ao->head, node) { + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { if ((sndid >= 0 && key->sndid != sndid) || (rcvid >= 0 && key->rcvid != rcvid)) continue; @@ -205,7 +206,7 @@ static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk, int l3index, if (!ao) return NULL; - hlist_for_each_entry_rcu(key, &ao->head, node) { + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { u8 prefixlen = min(prefix, key->prefixlen); if (!tcp_ao_key_cmp(key, l3index, addr, prefixlen, @@ -793,7 +794,7 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, if (!ao_info) return -ENOENT; - *key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); + *key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); if (!*key) return -ENOENT; *traffic_key = snd_other_key(*key); @@ -979,7 +980,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, */ key = READ_ONCE(info->rnext_key); if (key->rcvid != aoh->keyid) { - key = tcp_ao_established_key(info, -1, aoh->keyid); + key = tcp_ao_established_key(sk, info, -1, aoh->keyid); if (!key) goto key_not_found; } @@ -1003,7 +1004,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, aoh->rnext_keyid, tcp_ao_hdr_maclen(aoh)); /* If the key is not found we do nothing. */ - key = tcp_ao_established_key(info, aoh->rnext_keyid, -1); + key = tcp_ao_established_key(sk, info, aoh->rnext_keyid, -1); if (key) /* pairs with tcp_ao_del_cmd */ WRITE_ONCE(info->current_key, key); @@ -1163,7 +1164,7 @@ void tcp_ao_established(struct sock *sk) if (!ao) return; - hlist_for_each_entry_rcu(key, &ao->head, node) + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) tcp_ao_cache_traffic_keys(sk, ao, key); } @@ -1180,7 +1181,7 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; - hlist_for_each_entry_rcu(key, &ao->head, node) + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) tcp_ao_cache_traffic_keys(sk, ao, key); } @@ -1256,14 +1257,14 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, key_head = rcu_dereference(hlist_first_rcu(&new_ao->head)); first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node); - key = tcp_ao_established_key(new_ao, tcp_rsk(req)->ao_keyid, -1); + key = tcp_ao_established_key(req_to_sk(req), new_ao, tcp_rsk(req)->ao_keyid, -1); if (key) new_ao->current_key = key; else new_ao->current_key = first_key; /* set rnext_key */ - key = tcp_ao_established_key(new_ao, -1, tcp_rsk(req)->ao_rcv_next); + key = tcp_ao_established_key(req_to_sk(req), new_ao, -1, tcp_rsk(req)->ao_rcv_next); if (key) new_ao->rnext_key = key; else @@ -1857,12 +1858,12 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, * if there's any. */ if (cmd.set_current) { - new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1); + new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1); if (!new_current) return -ENOENT; } if (cmd.set_rnext) { - new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext); + new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext); if (!new_rnext) return -ENOENT; } @@ -1902,7 +1903,8 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, * "It is presumed that an MKT affecting a particular * connection cannot be destroyed during an active connection" */ - hlist_for_each_entry_rcu(key, &ao_info->head, node) { + hlist_for_each_entry_rcu(key, &ao_info->head, node, + lockdep_sock_is_held(sk)) { if (cmd.sndid != key->sndid || cmd.rcvid != key->rcvid) continue; @@ -2000,14 +2002,14 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, * if there's any. */ if (cmd.set_current) { - new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1); + new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1); if (!new_current) { err = -ENOENT; goto out; } } if (cmd.set_rnext) { - new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext); + new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext); if (!new_rnext) { err = -ENOENT; goto out; @@ -2101,7 +2103,8 @@ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen) * The layout of the fields in the user and kernel structures is expected to * be the same (including in the 32bit vs 64bit case). */ -static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info, +static int tcp_ao_copy_mkts_to_user(const struct sock *sk, + struct tcp_ao_info *ao_info, sockptr_t optval, sockptr_t optlen) { struct tcp_ao_getsockopt opt_in, opt_out; @@ -2229,7 +2232,8 @@ static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info, /* May change in RX, while we're dumping, pre-fetch it */ current_key = READ_ONCE(ao_info->current_key); - hlist_for_each_entry_rcu(key, &ao_info->head, node) { + hlist_for_each_entry_rcu(key, &ao_info->head, node, + lockdep_sock_is_held(sk)) { if (opt_in.get_all) goto match; @@ -2309,7 +2313,7 @@ int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) if (!ao_info) return -ENOENT; - return tcp_ao_copy_mkts_to_user(ao_info, optval, optlen); + return tcp_ao_copy_mkts_to_user(sk, ao_info, optval, optlen); } int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) @@ -2396,7 +2400,7 @@ int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen) WRITE_ONCE(ao->snd_sne, cmd.snd_sne); WRITE_ONCE(ao->rcv_sne, cmd.rcv_sne); - hlist_for_each_entry_rcu(key, &ao->head, node) + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) tcp_ao_cache_traffic_keys(sk, ao, key); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9d3dd101ea71..a38c8b1f44db 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1053,7 +1053,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) } if (aoh) - key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); } } if (key.ao_key) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 597920061a3a..c748eeae1453 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1172,8 +1172,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) - key.ao_key = tcp_ao_established_key(ao_info, - aoh->rnext_keyid, -1); + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); } } if (key.ao_key) { -- cgit v1.2.3 From b3e8f29d6b45e865bab9a9964709ff7413e33e85 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:15 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splats with flowtables The transaction mutex prevents concurrent add/delete, its ok to iterate those lists outside of rcu read side critical sections. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 15 +++++++++------ net/netfilter/nft_flow_offload.c | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 91ae20cb7648..c1513bd14568 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1463,7 +1463,8 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a51731d76401..9e367e134691 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8378,12 +8378,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -8739,7 +8741,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); @@ -8933,7 +8935,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb, flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; - flowtable = nft_flowtable_lookup(table, attr, genmask); + flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { @@ -9003,7 +9005,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -9145,7 +9148,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 2f732fae5a83..65199c23c75c 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -409,8 +409,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->net, ctx->table, + tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); -- cgit v1.2.3 From e57dfaa4b0a72f6a231a8eedb95d260045bbd8db Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:57 +0100 Subject: xfrm: Convert struct xfrm_dst_lookup_params -> tos to dscp_t. Add type annotation to the "tos" field of struct xfrm_dst_lookup_params, to ensure that the ECN bits aren't mistakenly taken into account when doing route lookups. Rename that field (tos -> dscp) to make that change explicit. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/ipv4/xfrm4_policy.c | 3 ++- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2b87999bd5aa..32c09e85a64c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -354,7 +355,7 @@ void xfrm_if_unregister_cb(void); struct xfrm_dst_lookup_params { struct net *net; - int tos; + dscp_t dscp; int oif; xfrm_address_t *saddr; xfrm_address_t *daddr; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7e1c2faed1ff..7fb6205619e7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = params->tos; + fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7e3e10fb9ca0..4408c11c0835 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -312,7 +312,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, params.net = net; params.saddr = saddr; params.daddr = daddr; - params.tos = inet_dscp_to_dsfield(dscp); + params.dscp = dscp; params.oif = oif; params.mark = mark; params.ipproto = x->id.proto; -- cgit v1.2.3 From 44d0469f79bd3d0b3433732877358df7dc6b17b1 Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 00:37:42 +0000 Subject: bpf: Add sk_is_inet and IS_ICSK check in tls_sw_has_ctx_tx/rx As the introduction of the support for vsock and unix sockets in sockmap, tls_sw_has_ctx_tx/rx cannot presume the socket passed in must be IS_ICSK. vsock and af_unix sockets have vsock_sock and unix_sock instead of inet_connection_sock. For these sockets, tls_get_ctx may return an invalid pointer and cause page fault in function tls_sw_ctx_rx. BUG: unable to handle page fault for address: 0000000000040030 Workqueue: vsock-loopback vsock_loopback_work RIP: 0010:sk_psock_strp_data_ready+0x23/0x60 Call Trace: ? __die+0x81/0xc3 ? no_context+0x194/0x350 ? do_page_fault+0x30/0x110 ? async_page_fault+0x3e/0x50 ? sk_psock_strp_data_ready+0x23/0x60 virtio_transport_recv_pkt+0x750/0x800 ? update_load_avg+0x7e/0x620 vsock_loopback_work+0xd0/0x100 process_one_work+0x1a7/0x360 worker_thread+0x30/0x390 ? create_worker+0x1a0/0x1a0 kthread+0x112/0x130 ? __kthread_cancel_work+0x40/0x40 ret_from_fork+0x1f/0x40 v2: - Add IS_ICSK check v3: - Update the commits in Fixes Fixes: 634f1a7110b4 ("vsock: support sockmap") Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Signed-off-by: Zijian Zhang Acked-by: Stanislav Fomichev Acked-by: Jakub Kicinski Reviewed-by: Cong Wang Acked-by: Stefano Garzarella Link: https://lore.kernel.org/r/20241106003742.399240-1-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau --- include/net/tls.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 3a33924db2bc..61fef2880114 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -390,8 +390,12 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) static inline bool tls_sw_has_ctx_tx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_tx(ctx); @@ -399,8 +403,12 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) static inline bool tls_sw_has_ctx_rx(const struct sock *sk) { - struct tls_context *ctx = tls_get_ctx(sk); + struct tls_context *ctx; + + if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk)) + return false; + ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_rx(ctx); -- cgit v1.2.3 From 9907cda95fcbf44141b1292faab89cf8ec542f22 Mon Sep 17 00:00:00 2001 From: Juraj Å arinay Date: Sun, 3 Nov 2024 13:45:25 +0100 Subject: net: nfc: Propagate ISO14443 type A target ATS to userspace via netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 20-byte field ats to struct nfc_target and expose it as NFC_ATTR_TARGET_ATS via the netlink interface. The payload contains 'historical bytes' that help to distinguish cards from one another. The information is commonly used to assemble an emulated ATR similar to that reported by smart cards with contacts. Add a 20-byte field target_ats to struct nci_dev to hold the payload obtained in nci_rf_intf_activated_ntf_packet() and copy it to over to nfc_target.ats in nci_activate_target(). The approach is similar to the handling of 'general bytes' within ATR_RES. Replace the hard-coded size of rats_res within struct activation_params_nfca_poll_iso_dep by the equal constant NFC_ATS_MAXSIZE now defined in nfc.h Within NCI, the information corresponds to the 'RATS Response' activation parameter that omits the initial length byte TL. This loses no information and is consistent with our handling of SENSB_RES that also drops the first (constant) byte. Tested with nxp_nci_i2c on a few type A targets including an ICAO 9303 compliant passport. I refrain from the corresponding change to digital_in_recv_ats() to have the few drivers based on digital.h fill nfc_target.ats, as I have no way to test it. That class of drivers appear not to set NFC_ATTR_TARGET_SENSB_RES either. Consider a separate patch to propagate (all) the parameters. Signed-off-by: Juraj Å arinay Link: https://patch.msgid.link/20241103124525.8392-1-juraj@sarinay.com Signed-off-by: Paolo Abeni --- include/net/nfc/nci.h | 2 +- include/net/nfc/nci_core.h | 4 ++++ include/net/nfc/nfc.h | 4 ++++ include/uapi/linux/nfc.h | 3 +++ net/nfc/nci/core.c | 13 ++++++++++++- net/nfc/nci/ntf.c | 32 +++++++++++++++++++++++++++++++- net/nfc/netlink.c | 5 +++++ 7 files changed, 60 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index dc36519d16aa..09efcaed7c3f 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -475,7 +475,7 @@ struct nci_rf_discover_ntf { #define NCI_OP_RF_INTF_ACTIVATED_NTF nci_opcode_pack(NCI_GID_RF_MGMT, 0x05) struct activation_params_nfca_poll_iso_dep { __u8 rats_res_len; - __u8 rats_res[20]; + __u8 rats_res[NFC_ATS_MAXSIZE]; }; struct activation_params_nfcb_poll_iso_dep { diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index ea8595651c38..e180bdf2f82b 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -265,6 +265,10 @@ struct nci_dev { /* stored during intf_activated_ntf */ __u8 remote_gb[NFC_MAX_GT_LEN]; __u8 remote_gb_len; + + /* stored during intf_activated_ntf */ + __u8 target_ats[NFC_ATS_MAXSIZE]; + __u8 target_ats_len; }; /* ----- NCI Devices ----- */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 3a3781838c67..127e6c7d910d 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -86,6 +86,8 @@ struct nfc_ops { * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. + * @ats_len: length of Answer To Select in bytes + * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; @@ -105,6 +107,8 @@ struct nfc_target { u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; + u8 ats_len; + u8 ats[NFC_ATS_MAXSIZE]; }; /** diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 4fa4e979e948..2f5b4be25261 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -164,6 +164,7 @@ enum nfc_commands { * @NFC_ATTR_VENDOR_SUBCMD: Vendor specific sub command * @NFC_ATTR_VENDOR_DATA: Vendor specific data, to be optionally passed * to a vendor specific command implementation + * @NFC_ATTR_TARGET_ATS: ISO 14443 type A target Answer To Select */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -198,6 +199,7 @@ enum nfc_attrs { NFC_ATTR_VENDOR_ID, NFC_ATTR_VENDOR_SUBCMD, NFC_ATTR_VENDOR_DATA, + NFC_ATTR_TARGET_ATS, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; @@ -225,6 +227,7 @@ enum nfc_sdp_attr { #define NFC_GB_MAXSIZE 48 #define NFC_FIRMWARE_NAME_MAXSIZE 32 #define NFC_ISO15693_UID_MAXSIZE 8 +#define NFC_ATS_MAXSIZE 20 /* NFC protocols */ #define NFC_PROTO_JEWEL 1 diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index f456a5911e7d..1ec5955fe469 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -757,6 +757,14 @@ int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) } EXPORT_SYMBOL(nci_core_conn_close); +static void nci_set_target_ats(struct nfc_target *target, struct nci_dev *ndev) +{ + if (ndev->target_ats_len > 0) { + target->ats_len = ndev->target_ats_len; + memcpy(target->ats, ndev->target_ats, target->ats_len); + } +} + static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); @@ -939,8 +947,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT)); } - if (!rc) + if (!rc) { ndev->target_active_prot = protocol; + if (protocol == NFC_PROTO_ISO14443) + nci_set_target_ats(target, ndev); + } return rc; } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 994a0a1efb58..a818eff27e6b 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -402,7 +402,7 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: nfca_poll = &ntf->activation_params.nfca_poll_iso_dep; - nfca_poll->rats_res_len = min_t(__u8, *data++, 20); + nfca_poll->rats_res_len = min_t(__u8, *data++, NFC_ATS_MAXSIZE); pr_debug("rats_res_len %d\n", nfca_poll->rats_res_len); if (nfca_poll->rats_res_len > 0) { memcpy(nfca_poll->rats_res, @@ -531,6 +531,28 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, return NCI_STATUS_OK; } +static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, + const struct nci_rf_intf_activated_ntf *ntf) +{ + ndev->target_ats_len = 0; + + if (ntf->activation_params_len <= 0) + return NCI_STATUS_OK; + + if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > NFC_ATS_MAXSIZE) { + pr_debug("ATS too long\n"); + return NCI_STATUS_RF_PROTOCOL_ERROR; + } + + if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > 0) { + ndev->target_ats_len = ntf->activation_params.nfca_poll_iso_dep.rats_res_len; + memcpy(ndev->target_ats, ntf->activation_params.nfca_poll_iso_dep.rats_res, + ndev->target_ats_len); + } + + return NCI_STATUS_OK; +} + static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { @@ -660,6 +682,14 @@ exit: if (err != NCI_STATUS_OK) pr_err("unable to store general bytes\n"); } + + /* store ATS to be reported later in nci_activate_target */ + if (ntf.rf_interface == NCI_RF_INTERFACE_ISO_DEP && + ntf.activation_rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) { + err = nci_store_ats_nfc_iso_dep(ndev, &ntf); + if (err != NCI_STATUS_OK) + pr_err("unable to store ATS\n"); + } } if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) { diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index dd2ce73a24fb..6a40b8d0350d 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -96,6 +96,11 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, goto nla_put_failure; } + if (target->ats_len > 0 && + nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, + target->ats)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From c03d278fdf35e73dd0ec543b9b556876b9d9a8dc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 5 Nov 2024 12:07:22 +0100 Subject: netfilter: nf_tables: wait for rcu grace period on net_device removal 8c873e219970 ("netfilter: core: free hooks with call_rcu") removed synchronize_net() call when unregistering basechain hook, however, net_device removal event handler for the NFPROTO_NETDEV was not updated to wait for RCU grace period. Note that 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") does not remove basechain rules on device removal, I was hinted to remove rules on net_device removal later, see 5ebe0b0eec9d ("netfilter: nf_tables: destroy basechain and rules on netdevice removal"). Although NETDEV_UNREGISTER event is guaranteed to be handled after synchronize_net() call, this path needs to wait for rcu grace period via rcu callback to release basechain hooks if netns is alive because an ongoing netlink dump could be in progress (sockets hold a reference on the netns). Note that nf_tables_pre_exit_net() unregisters and releases basechain hooks but it is possible to see NETDEV_UNREGISTER at a later stage in the netns exit path, eg. veth peer device in another netns: cleanup_net() default_device_exit_batch() unregister_netdevice_many_notify() notifier_call_chain() nf_tables_netdev_event() __nft_release_basechain() In this particular case, same rule of thumb applies: if netns is alive, then wait for rcu grace period because netlink dump in the other netns could be in progress. Otherwise, if the other netns is going away then no netlink dump can be in progress and basechain hooks can be released inmediately. While at it, turn WARN_ON() into WARN_ON_ONCE() for the basechain validation, which should not ever happen. Fixes: 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 41 ++++++++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 91ae20cb7648..066a3ea33b12 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1103,6 +1103,7 @@ struct nft_rule_blob { * @name: name of the chain * @udlen: user data length * @udata: user data in the chain + * @rcu_head: rcu head for deferred release * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { @@ -1120,6 +1121,7 @@ struct nft_chain { char *name; u16 udlen; u8 *udata; + struct rcu_head rcu_head; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; @@ -1263,6 +1265,7 @@ static inline void nft_use_inc_restore(u32 *use) * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table + * @net: netnamespace this table belongs to * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table @@ -1282,6 +1285,7 @@ struct nft_table { struct list_head sets; struct list_head objects; struct list_head flowtables; + possible_net_t net; u64 hgenerator; u64 handle; u32 use; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a24fe62650a7..588a2757986c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1495,6 +1495,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); + write_pnet(&table->net, net); table->family = family; table->flags = flags; table->handle = ++nft_net->table_handle; @@ -11430,22 +11431,48 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -int __nft_release_basechain(struct nft_ctx *ctx) +static void __nft_release_basechain_now(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; - if (WARN_ON(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); - nft_use_dec(&ctx->chain->use); nf_tables_rule_release(ctx, rule); } + nf_tables_chain_destroy(ctx->chain); +} + +static void nft_release_basechain_rcu(struct rcu_head *head) +{ + struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); + struct nft_ctx ctx = { + .family = chain->table->family, + .chain = chain, + .net = read_pnet(&chain->table->net), + }; + + __nft_release_basechain_now(&ctx); + put_net(ctx.net); +} + +int __nft_release_basechain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + + if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) + return 0; + + nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); + list_for_each_entry(rule, &ctx->chain->rules, list) + nft_use_dec(&ctx->chain->use); + nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx->chain); + + if (maybe_get_net(ctx->net)) + call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu); + else + __nft_release_basechain_now(ctx); return 0; } -- cgit v1.2.3 From df81366b484da9618cec12cfc30ec93f7d4b6ac7 Mon Sep 17 00:00:00 2001 From: Zong-Zhe Yang Date: Fri, 1 Nov 2024 16:21:43 +0800 Subject: wifi: mac80211: fix description of ieee80211_set_active_links() for new sequence The sequence of calls has changed, but the description is inconsistent. So, fix the description. Fixes: 188a1bf89432 ("wifi: mac80211: re-order assigning channel in activate links") Signed-off-by: Zong-Zhe Yang Link: https://patch.msgid.link/20241101082143.11138-1-kevin_yang@realtek.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b4f246cdcca4..a97c9f85ae9a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7672,12 +7672,12 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) + * - assign_vif_chanctx(link_id=4) * - change_sta_links(0x11) for each affected STA (the AP) * (TDLS connections on now inactive links should be torn down) * - remove group keys on the old link (link_id 0) * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) * - change_sta_links(0x10) for each affected STA (the AP) - * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) * * Return: 0 on success. An error code otherwise. -- cgit v1.2.3 From 48171c65f61148b0025128b70837280123f1309d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 6 Nov 2024 22:37:32 +0100 Subject: ipv4: Prepare ip_route_output() to future .flowi4_tos conversion. Convert the "tos" parameter of ip_route_output() to dscp_t. This way we'll have a dscp_t value directly available when .flowi4_tos will eventually be converted to dscp_t. All ip_route_output() callers but one set this "tos" parameter to 0 and therefore don't need to be adapted to the new prototype. Only br_nf_pre_routing_finish() needs conversion. It can just use ip4h_dscp() to get the DSCP field from the IPv4 header. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/0f10d031dd44c70aae9bc6e19391cb30d5c2fe71.1730928699.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 6 +++--- net/bridge/br_netfilter_hooks.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 586e59f7ed8a..0a690adfdff5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -156,12 +156,12 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 * structure is only partially set, it may bypass some fib-rules. */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, - __be32 saddr, u8 tos, int oif, - __u8 scope) + __be32 saddr, dscp_t dscp, + int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = tos, + .flowi4_tos = inet_dscp_to_dsfield(dscp), .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 17a5f5923d61..7f2f40cef5fe 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -406,7 +406,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ goto free_skb; rt = ip_route_output(net, iph->daddr, 0, - iph->tos & INET_DSCP_MASK, 0, + ip4h_dscp(iph), 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't -- cgit v1.2.3 From 580db513b4a9d52f306580015a1872eea0a0894e Mon Sep 17 00:00:00 2001 From: Khang Nguyen Date: Tue, 5 Nov 2024 14:19:15 +0700 Subject: net: mctp: Expose transport binding identifier via IFLA attribute MCTP control protocol implementations are transport binding dependent. Endpoint discovery is mandatory based on transport binding. Message timing requirements are specified in each respective transport binding specification. However, we currently have no means to get this information from MCTP links. Add a IFLA_MCTP_PHYS_BINDING netlink link attribute, which represents the transport type using the DMTF DSP0239-defined type numbers, returned as part of RTM_GETLINK data. We get an IFLA_MCTP_PHYS_BINDING attribute for each MCTP link, for example: - 0x00 (unspec) for loopback interface; - 0x01 (SMBus/I2C) for mctpi2c%d interfaces; and - 0x05 (serial) for mctpserial%d interfaces. Signed-off-by: Khang Nguyen Reviewed-by: Matt Johnston Link: https://patch.msgid.link/20241105071915.821871-1-khangng@os.amperecomputing.com Signed-off-by: Jakub Kicinski --- drivers/net/mctp/mctp-i2c.c | 3 ++- drivers/net/mctp/mctp-i3c.c | 2 +- drivers/net/mctp/mctp-serial.c | 5 +++-- include/net/mctp.h | 18 ++++++++++++++++++ include/net/mctpdevice.h | 4 +++- include/uapi/linux/if_link.h | 1 + net/mctp/device.c | 12 +++++++++--- 7 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index e70fb6687994..d2b3f5a59141 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -880,7 +880,8 @@ static int mctp_i2c_add_netdev(struct mctp_i2c_client *mcli, goto err; } - rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops); + rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops, + MCTP_PHYS_BINDING_SMBUS); if (rc < 0) { dev_err(&mcli->client->dev, "register netdev \"%s\" failed %d\n", diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index 1bc87a062686..9adad59b8676 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -607,7 +607,7 @@ __must_hold(&busdevs_lock) goto err_free_uninit; } - rc = mctp_register_netdev(ndev, NULL); + rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_I3C); if (rc < 0) { dev_warn(&ndev->dev, "netdev register failed: %d\n", rc); goto err_free_netdev; diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c index e63720ec3238..26c9a33fd636 100644 --- a/drivers/net/mctp/mctp-serial.c +++ b/drivers/net/mctp/mctp-serial.c @@ -23,6 +23,7 @@ #include #include +#include #include #define MCTP_SERIAL_MTU 68 /* base mtu (64) + mctp header */ @@ -470,7 +471,7 @@ static int mctp_serial_open(struct tty_struct *tty) spin_lock_init(&dev->lock); INIT_WORK(&dev->tx_work, mctp_serial_tx_work); - rc = register_netdev(ndev); + rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_SERIAL); if (rc) goto free_netdev; @@ -492,7 +493,7 @@ static void mctp_serial_close(struct tty_struct *tty) struct mctp_serial *dev = tty->disc_data; int idx = dev->idx; - unregister_netdev(dev->netdev); + mctp_unregister_netdev(dev->netdev); ida_free(&mctp_serial_ida, idx); } diff --git a/include/net/mctp.h b/include/net/mctp.h index 28d59ae94ca3..1ecbff7116f6 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -298,4 +298,22 @@ void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); +/* MCTP IDs and Codes from DMTF specification + * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf + */ +enum mctp_phys_binding { + MCTP_PHYS_BINDING_UNSPEC = 0x00, + MCTP_PHYS_BINDING_SMBUS = 0x01, + MCTP_PHYS_BINDING_PCIE_VDM = 0x02, + MCTP_PHYS_BINDING_USB = 0x03, + MCTP_PHYS_BINDING_KCS = 0x04, + MCTP_PHYS_BINDING_SERIAL = 0x05, + MCTP_PHYS_BINDING_I3C = 0x06, + MCTP_PHYS_BINDING_MMBI = 0x07, + MCTP_PHYS_BINDING_PCC = 0x08, + MCTP_PHYS_BINDING_UCIE = 0x09, + MCTP_PHYS_BINDING_VENDOR = 0xFF, +}; + #endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 5c0d04b5c12c..957d9ef924c5 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -22,6 +22,7 @@ struct mctp_dev { refcount_t refs; unsigned int net; + enum mctp_phys_binding binding; const struct mctp_netdev_ops *ops; @@ -44,7 +45,8 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops); + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding); void mctp_unregister_netdev(struct net_device *dev); void mctp_dev_hold(struct mctp_dev *mdev); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8516c1ccd57a..2575e0cd9b48 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1958,6 +1958,7 @@ struct ifla_rmnet_flags { enum { IFLA_MCTP_UNSPEC, IFLA_MCTP_NET, + IFLA_MCTP_PHYS_BINDING, __IFLA_MCTP_MAX, }; diff --git a/net/mctp/device.c b/net/mctp/device.c index 3d75b919995d..26ce34b7e88e 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -371,6 +371,8 @@ static int mctp_fill_link_af(struct sk_buff *skb, return -ENODATA; if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) return -EMSGSIZE; + if (nla_put_u8(skb, IFLA_MCTP_PHYS_BINDING, mdev->binding)) + return -EMSGSIZE; return 0; } @@ -385,6 +387,7 @@ static size_t mctp_get_link_af_size(const struct net_device *dev, if (!mdev) return 0; ret = nla_total_size(4); /* IFLA_MCTP_NET */ + ret += nla_total_size(1); /* IFLA_MCTP_PHYS_BINDING */ mctp_dev_put(mdev); return ret; } @@ -480,7 +483,8 @@ static int mctp_dev_notify(struct notifier_block *this, unsigned long event, } static int mctp_register_netdevice(struct net_device *dev, - const struct mctp_netdev_ops *ops) + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding) { struct mctp_dev *mdev; @@ -489,17 +493,19 @@ static int mctp_register_netdevice(struct net_device *dev, return PTR_ERR(mdev); mdev->ops = ops; + mdev->binding = binding; return register_netdevice(dev); } int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops) + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding) { int rc; rtnl_lock(); - rc = mctp_register_netdevice(dev, ops); + rc = mctp_register_netdevice(dev, ops, binding); rtnl_unlock(); return rc; -- cgit v1.2.3 From 41b3caa7c0761141aa6d508924b9d23db57a17bc Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:38 +0000 Subject: neighbour: Add hlist_node to struct neighbour Add a doubly-linked node to neighbours, so that they can be deleted without iterating the entire bucket they're in. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-2-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 2 ++ net/core/neighbour.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3887ed9e5026..0402447854c7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -136,6 +136,7 @@ struct neigh_statistics { struct neighbour { struct neighbour __rcu *next; + struct hlist_node hash; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -191,6 +192,7 @@ struct pneigh_entry { struct neigh_hash_table { struct neighbour __rcu **hash_buckets; + struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4b871cecd2ce..5552e6b05c82 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -216,6 +216,7 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); + hlist_del_rcu(&n->hash); neigh_mark_dead(n); retval = true; } @@ -402,6 +403,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); @@ -529,20 +531,30 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { + size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head); size_t size = (1 << shift) * sizeof(struct neighbour *); - struct neigh_hash_table *ret; struct neighbour __rcu **buckets; + struct hlist_head *hash_heads; + struct neigh_hash_table *ret; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; + buckets = kvzalloc(size, GFP_ATOMIC); if (!buckets) { kfree(ret); return NULL; } + hash_heads = kvzalloc(hash_heads_size, GFP_ATOMIC); + if (!hash_heads) { + kvfree(buckets); + kfree(ret); + return NULL; + } ret->hash_buckets = buckets; + ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); @@ -556,6 +568,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) rcu); kvfree(nht->hash_buckets); + kvfree(nht->hash_heads); kfree(nht); } @@ -592,6 +605,8 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, new_nht->hash_buckets[hash], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(new_nht->hash_buckets[hash], n); + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } @@ -702,6 +717,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(nht->hash_buckets[hash_val], n); + hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -987,6 +1003,7 @@ static void neigh_periodic_work(struct work_struct *work) rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -3116,6 +3133,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); neigh_mark_dead(n); } else np = &n->next; -- cgit v1.2.3 From d7ddee1a522ddf5b28e2a3f7093cf238c96f492a Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:39 +0000 Subject: neighbour: Define neigh_for_each_in_bucket Introduce neigh_for_each_in_bucket in neighbour.h, to help iterate over the neighbour table more succinctly. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-3-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 0402447854c7..4b9068c5e668 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -277,6 +277,12 @@ static inline void *neighbour_priv(const struct neighbour *n) extern const struct nla_policy nda_policy[]; +#define neigh_for_each_in_bucket(pos, head) hlist_for_each_entry(pos, head, hash) +#define neigh_for_each_in_bucket_rcu(pos, head) \ + hlist_for_each_entry_rcu(pos, head, hash) +#define neigh_for_each_in_bucket_safe(pos, tmp, head) \ + hlist_for_each_entry_safe(pos, tmp, head, hash) + static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; -- cgit v1.2.3 From 0e3bcb0f78a0ca7cfdb7906dc79d922e19ba09b5 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:41 +0000 Subject: neighbour: Convert iteration to use hlist+macro Remove all usage of the bare neighbour::next pointer, replacing them with neighbour::hash and its for_each macro. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-5-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +---- net/core/neighbour.c | 47 ++++++++++++++++++----------------------------- 2 files changed, 19 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 4b9068c5e668..94cf4f8c118f 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -311,12 +311,9 @@ static inline struct neighbour *___neigh_lookup_noref( u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - for (n = rcu_dereference(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference(n->next)) { + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; - } return NULL; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3485d6b3ba99..f99354d768c2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -387,11 +387,11 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; struct neighbour __rcu **np = &nht->hash_buckets[i]; + struct hlist_node *tmp; + struct neighbour *n; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { if (dev && n->dev != dev) { np = &n->next; continue; @@ -587,18 +587,14 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { - struct neighbour *n, *next; + struct hlist_node *tmp; + struct neighbour *n; - for (n = rcu_dereference_protected(old_nht->hash_buckets[i], - lockdep_is_held(&tbl->lock)); - n != NULL; - n = next) { + neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); - next = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); rcu_assign_pointer(n->next, rcu_dereference_protected( @@ -693,11 +689,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); - n1 != NULL; - n1 = rcu_dereference_protected(n1->next, - lockdep_is_held(&tbl->lock))) { + neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); @@ -949,10 +941,11 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); - struct neighbour *n; + struct neigh_hash_table *nht; struct neighbour __rcu **np; + struct hlist_node *tmp; + struct neighbour *n; unsigned int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); @@ -979,8 +972,7 @@ static void neigh_periodic_work(struct work_struct *work) for (i = 0 ; i < (1 << nht->hash_shift); i++) { np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); @@ -2730,9 +2722,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0; - n != NULL; - n = rcu_dereference(n->next)) { + idx = 0; + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -3099,9 +3090,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = rcu_dereference(nht->hash_buckets[chain]); - n != NULL; - n = rcu_dereference(n->next)) + neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); @@ -3113,18 +3102,18 @@ EXPORT_SYMBOL(neigh_for_each); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { - int chain; struct neigh_hash_table *nht; + int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { - struct neighbour *n; struct neighbour __rcu **np; + struct hlist_node *tmp; + struct neighbour *n; np = &nht->hash_buckets[chain]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); -- cgit v1.2.3 From a01a67ab2fffa7458354f0a666a6d550fa2b82fc Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:42 +0000 Subject: neighbour: Remove bare neighbour::next pointer Remove the now-unused neighbour::next pointer, leaving struct neighbour solely with the hlist_node implementation. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-6-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 +-- net/core/neighbour.c | 90 ++++++------------------------------------------- net/ipv4/arp.c | 2 +- 3 files changed, 12 insertions(+), 84 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 94cf4f8c118f..40aac1e24c68 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -135,7 +135,6 @@ struct neigh_statistics { #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { - struct neighbour __rcu *next; struct hlist_node hash; struct neigh_table *tbl; struct neigh_parms *parms; @@ -191,7 +190,6 @@ struct pneigh_entry { #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { - struct neighbour __rcu **hash_buckets; struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; @@ -354,7 +352,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); +bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f99354d768c2..59f359c7b5e3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -204,18 +204,12 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, } } -static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, - struct neigh_table *tbl) +bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { - struct neighbour *neigh; - - neigh = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(*np, neigh); hlist_del_rcu(&n->hash); neigh_mark_dead(n); retval = true; @@ -226,29 +220,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, return retval; } -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) -{ - struct neigh_hash_table *nht; - void *pkey = ndel->primary_key; - u32 hash_val; - struct neighbour *n; - struct neighbour __rcu **np; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); - hash_val = hash_val >> (32 - nht->hash_shift); - - np = &nht->hash_buckets[hash_val]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock)))) { - if (n == ndel) - return neigh_del(n, np, tbl); - np = &n->next; - } - return false; -} - static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - @@ -276,7 +247,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) remove = true; write_unlock(&n->lock); - if (remove && neigh_remove_one(n, tbl)) + if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; @@ -387,22 +358,15 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour __rcu **np = &nht->hash_buckets[i]; struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { - if (dev && n->dev != dev) { - np = &n->next; + if (dev && n->dev != dev) continue; - } - if (skip_perm && n->nud_state & NUD_PERMANENT) { - np = &n->next; + if (skip_perm && n->nud_state & NUD_PERMANENT) continue; - } - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); write_lock(&n->lock); neigh_del_timer(n); @@ -531,9 +495,7 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { - size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head); - size_t size = (1 << shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets; + size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; @@ -542,18 +504,11 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) if (!ret) return NULL; - buckets = kvzalloc(size, GFP_ATOMIC); - if (!buckets) { - kfree(ret); - return NULL; - } - hash_heads = kvzalloc(hash_heads_size, GFP_ATOMIC); + hash_heads = kvzalloc(size, GFP_ATOMIC); if (!hash_heads) { - kvfree(buckets); kfree(ret); return NULL; } - ret->hash_buckets = buckets; ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) @@ -567,7 +522,6 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table, rcu); - kvfree(nht->hash_buckets); kvfree(nht->hash_heads); kfree(nht); } @@ -596,11 +550,6 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, hash >>= (32 - new_nht->hash_shift); - rcu_assign_pointer(n->next, - rcu_dereference_protected( - new_nht->hash_buckets[hash], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(new_nht->hash_buckets[hash], n); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } @@ -705,10 +654,6 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); - rcu_assign_pointer(n->next, - rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(nht->hash_buckets[hash_val], n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); @@ -942,7 +887,6 @@ static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; - struct neighbour __rcu **np; struct hlist_node *tmp; struct neighbour *n; unsigned int i; @@ -970,8 +914,6 @@ static void neigh_periodic_work(struct work_struct *work) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { - np = &nht->hash_buckets[i]; - neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; @@ -981,7 +923,7 @@ static void neigh_periodic_work(struct work_struct *work) if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); - goto next_elt; + continue; } if (time_before(n->used, n->confirmed) && @@ -992,9 +934,6 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); hlist_del_rcu(&n->hash); neigh_mark_dead(n); write_unlock(&n->lock); @@ -1002,9 +941,6 @@ static void neigh_periodic_work(struct work_struct *work) continue; } write_unlock(&n->lock); - -next_elt: - np = &n->next; } /* * It's fine to release lock here, even if hash table @@ -1951,7 +1887,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); + neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: @@ -3108,24 +3044,18 @@ void __neigh_for_each_release(struct neigh_table *tbl, nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { - struct neighbour __rcu **np; struct hlist_node *tmp; struct neighbour *n; - np = &nht->hash_buckets[chain]; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); hlist_del_rcu(&n->hash); neigh_mark_dead(n); - } else - np = &n->next; + } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 11c1519b3699..cb9a7ed8abd3 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1215,7 +1215,7 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); + neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } -- cgit v1.2.3 From f7f52738637f4361c108cad36e23ee98959a9006 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:43 +0000 Subject: neighbour: Create netdev->neighbour association Create a mapping between a netdev and its neighoburs, allowing for much cheaper flushes. Signed-off-by: Gilad Naaman Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241107160444.2913124-7-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 7 ++ include/net/neighbour.h | 9 +- include/net/neighbour_tables.h | 12 +++ net/core/neighbour.c | 96 +++++++++++++--------- 5 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 include/net/neighbour_tables.h (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index ade50d4e67cf..15e31ece675f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -188,4 +188,5 @@ u64 max_pacing_offload_horizon struct_napi_config* napi_config unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs +struct hlist_head neighbours[2] =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3c552b648b27..df4483598628 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2032,6 +2033,9 @@ enum netdev_reg_state { * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * + * @neighbours: List heads pointing to this device's neighbours' + * dev_list, one per address-family. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2440,6 +2444,9 @@ struct net_device { */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif + + struct hlist_head neighbours[NEIGH_NR_TABLES]; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 40aac1e24c68..9a832cab5b1d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -29,6 +29,7 @@ #include #include #include +#include /* * NUD stands for "neighbor unreachability detection" @@ -136,6 +137,7 @@ struct neigh_statistics { struct neighbour { struct hlist_node hash; + struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -236,13 +238,6 @@ struct neigh_table { struct pneigh_entry **phash_buckets; }; -enum { - NEIGH_ARP_TABLE = 0, - NEIGH_ND_TABLE = 1, - NEIGH_NR_TABLES, - NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ -}; - static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h new file mode 100644 index 000000000000..bcffbe8f7601 --- /dev/null +++ b/include/net/neighbour_tables.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_NEIGHBOUR_TABLES_H +#define _NET_NEIGHBOUR_TABLES_H + +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ +}; + +#endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 59f359c7b5e3..5e572f6eaf2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -60,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* Neighbour hash table buckets are protected with rwlock tbl->lock. @@ -211,6 +230,7 @@ bool neigh_remove_one(struct neighbour *n) write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } @@ -351,48 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - int i; - struct neigh_hash_table *nht; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct hlist_node *tmp; - struct neighbour *n; + dev_head = neigh_get_dev_table(dev, tbl->family); - neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { - if (dev && n->dev != dev) - continue; - if (skip_perm && n->nud_state & NUD_PERMANENT) - continue; + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && n->nud_state & NUD_PERMANENT) + continue; - hlist_del_rcu(&n->hash); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + write_lock(&n->lock); + neigh_del_timer(n); + neigh_mark_dead(n); + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + neigh_dbg(2, "neigh %p is stray\n", n); } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); } } @@ -655,6 +669,10 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -935,6 +953,7 @@ static void neigh_periodic_work(struct work_struct *work) !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -3054,6 +3073,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, release = cb(n); if (release) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); -- cgit v1.2.3 From 7f4b3960e54faec72132a71da4a84a8e2a0b9037 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 8 Nov 2024 11:41:44 +0100 Subject: net: netlink: add nla_get_*_default() accessors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are quite a number of places that use patterns such as if (attr) val = nla_get_u16(attr); else val = DEFAULT; Add nla_get_u16_default() and friends like that to not have to type this out all the time. Acked-by: Toke Høiland-Jørgensen Acked-by: Jakub Kicinski Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241108114145.acd2aadb03ac.I3df6aac71d38a5baa1c0a03d0c7e82d4395c030e@changeid Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 2dc671c977ff..39eaa6be6ca8 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -142,6 +142,8 @@ * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * + * The same functions also exist with _default(). + * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area @@ -1695,6 +1697,20 @@ static inline u32 nla_get_u32(const struct nlattr *nla) return *(u32 *) nla_data(nla); } +/** + * nla_get_u32_default - return payload of u32 attribute or default + * @nla: u32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u32(nla); +} + /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute @@ -1704,6 +1720,21 @@ static inline __be32 nla_get_be32(const struct nlattr *nla) return *(__be32 *) nla_data(nla); } +/** + * nla_get_be32_default - return payload of be32 attribute or default + * @nla: __be32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_be32_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be32(nla); +} + /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute @@ -1713,6 +1744,21 @@ static inline __le32 nla_get_le32(const struct nlattr *nla) return *(__le32 *) nla_data(nla); } +/** + * nla_get_le32_default - return payload of le32 attribute or default + * @nla: __le32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le32 nla_get_le32_default(const struct nlattr *nla, + __le32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le32(nla); +} + /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute @@ -1722,6 +1768,20 @@ static inline u16 nla_get_u16(const struct nlattr *nla) return *(u16 *) nla_data(nla); } +/** + * nla_get_u16_default - return payload of u16 attribute or default + * @nla: u16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u16(nla); +} + /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute @@ -1731,6 +1791,21 @@ static inline __be16 nla_get_be16(const struct nlattr *nla) return *(__be16 *) nla_data(nla); } +/** + * nla_get_be16_default - return payload of be16 attribute or default + * @nla: __be16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be16 nla_get_be16_default(const struct nlattr *nla, + __be16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be16(nla); +} + /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute @@ -1740,6 +1815,21 @@ static inline __le16 nla_get_le16(const struct nlattr *nla) return *(__le16 *) nla_data(nla); } +/** + * nla_get_le16_default - return payload of le16 attribute or default + * @nla: __le16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le16 nla_get_le16_default(const struct nlattr *nla, + __le16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le16(nla); +} + /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute @@ -1749,6 +1839,20 @@ static inline u8 nla_get_u8(const struct nlattr *nla) return *(u8 *) nla_data(nla); } +/** + * nla_get_u8_default - return payload of u8 attribute or default + * @nla: u8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u8(nla); +} + /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute @@ -1762,6 +1866,20 @@ static inline u64 nla_get_u64(const struct nlattr *nla) return tmp; } +/** + * nla_get_u64_default - return payload of u64 attribute or default + * @nla: u64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_u64(nla); +} + /** * nla_get_uint - return payload of uint attribute * @nla: uint netlink attribute @@ -1773,6 +1891,20 @@ static inline u64 nla_get_uint(const struct nlattr *nla) return nla_get_u64(nla); } +/** + * nla_get_uint_default - return payload of uint attribute or default + * @nla: uint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_uint(nla); +} + /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute @@ -1786,6 +1918,21 @@ static inline __be64 nla_get_be64(const struct nlattr *nla) return tmp; } +/** + * nla_get_be64_default - return payload of be64 attribute or default + * @nla: __be64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be64 nla_get_be64_default(const struct nlattr *nla, + __be64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_be64(nla); +} + /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute @@ -1795,6 +1942,21 @@ static inline __le64 nla_get_le64(const struct nlattr *nla) return *(__le64 *) nla_data(nla); } +/** + * nla_get_le64_default - return payload of le64 attribute or default + * @nla: __le64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __le64 nla_get_le64_default(const struct nlattr *nla, + __le64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_le64(nla); +} + /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute @@ -1804,6 +1966,20 @@ static inline s32 nla_get_s32(const struct nlattr *nla) return *(s32 *) nla_data(nla); } +/** + * nla_get_s32_default - return payload of s32 attribute or default + * @nla: s32 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s32(nla); +} + /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute @@ -1813,6 +1989,20 @@ static inline s16 nla_get_s16(const struct nlattr *nla) return *(s16 *) nla_data(nla); } +/** + * nla_get_s16_default - return payload of s16 attribute or default + * @nla: s16 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s16(nla); +} + /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute @@ -1822,6 +2012,20 @@ static inline s8 nla_get_s8(const struct nlattr *nla) return *(s8 *) nla_data(nla); } +/** + * nla_get_s8_default - return payload of s8 attribute or default + * @nla: s8 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s8(nla); +} + /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute @@ -1835,6 +2039,20 @@ static inline s64 nla_get_s64(const struct nlattr *nla) return tmp; } +/** + * nla_get_s64_default - return payload of s64 attribute or default + * @nla: s64 netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_s64(nla); +} + /** * nla_get_sint - return payload of uint attribute * @nla: uint netlink attribute @@ -1846,6 +2064,20 @@ static inline s64 nla_get_sint(const struct nlattr *nla) return nla_get_s64(nla); } +/** + * nla_get_sint_default - return payload of sint attribute or default + * @nla: sint netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_sint(nla); +} + /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute @@ -1868,6 +2100,21 @@ static inline unsigned long nla_get_msecs(const struct nlattr *nla) return msecs_to_jiffies((unsigned long) msecs); } +/** + * nla_get_msecs_default - return payload of msecs attribute or default + * @nla: msecs netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, + unsigned long defvalue) +{ + if (!nla) + return defvalue; + return nla_get_msecs(nla); +} + /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute @@ -1877,6 +2124,21 @@ static inline __be32 nla_get_in_addr(const struct nlattr *nla) return *(__be32 *) nla_data(nla); } +/** + * nla_get_in_addr_default - return payload of be32 attribute or default + * @nla: IPv4 address netlink attribute, may be %NULL + * @defvalue: default value to use if @nla is %NULL + * + * Return: the value of the attribute, or the default value if not present + */ +static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, + __be32 defvalue) +{ + if (!nla) + return defvalue; + return nla_get_in_addr(nla); +} + /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute -- cgit v1.2.3 From d5ec8d91f82ef78405b506737952dec8af95a95b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:14 -0800 Subject: rtnetlink: Remove __rtnl_link_unregister(). rtnl_link_unregister() holds RTNL and calls __rtnl_link_unregister(), where we call synchronize_srcu() to wait inflight RTM_NEWLINK requests for per-netns RTNL. We put synchronize_srcu() in __rtnl_link_unregister() due to ifb.ko and dummy.ko. However, rtnl_newlink() will acquire SRCU before RTNL later in this series. Then, lockdep will detect the deadlock: rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); To avoid the problem, we must call synchronize_srcu() before RTNL in rtnl_link_unregister(). As a preparation, let's remove __rtnl_link_unregister(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 6 +++++- drivers/net/ifb.c | 6 +++++- include/net/rtnetlink.h | 1 - net/core/rtnetlink.c | 32 ++++++++++---------------------- 4 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index e9c5e1e11fa0..72618b6af44e 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -166,6 +166,7 @@ err: static int __init dummy_init_module(void) { + bool need_unregister = false; int i, err = 0; down_write(&pernet_ops_rwsem); @@ -179,12 +180,15 @@ static int __init dummy_init_module(void) cond_resched(); } if (err < 0) - __rtnl_link_unregister(&dummy_link_ops); + need_unregister = true; out: rtnl_unlock(); up_write(&pernet_ops_rwsem); + if (need_unregister) + rtnl_link_unregister(&dummy_link_ops); + return err; } diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 2c1b5def4a0b..a4b9ec4e8f30 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -424,6 +424,7 @@ err: static int __init ifb_init_module(void) { + bool need_unregister = false; int i, err; down_write(&pernet_ops_rwsem); @@ -437,12 +438,15 @@ static int __init ifb_init_module(void) cond_resched(); } if (err) - __rtnl_link_unregister(&ifb_link_ops); + need_unregister = true; out: rtnl_unlock(); up_write(&pernet_ops_rwsem); + if (need_unregister) + rtnl_link_unregister(&ifb_link_ops); + return err; } diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index b260c0cc9671..3ebfcc6e56fd 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -165,7 +165,6 @@ struct rtnl_link_ops { }; int __rtnl_link_register(struct rtnl_link_ops *ops); -void __rtnl_link_unregister(struct rtnl_link_ops *ops); int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a5c386a45501..f0246ecec7fa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -568,27 +568,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex and guarantee net_namespace_list - * integrity (hold pernet_ops_rwsem for writing to close the race - * with setup_net() and cleanup_net()). - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - list_del_rcu(&ops->list); - synchronize_srcu(&ops->srcu); - cleanup_srcu_struct(&ops->srcu); - - for_each_net(net) - __rtnl_kill_links(net, ops); -} -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ @@ -617,10 +596,19 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { + struct net *net; + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); - __rtnl_link_unregister(ops); + + list_del_rcu(&ops->list); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); up_write(&pernet_ops_rwsem); } -- cgit v1.2.3 From 6b57ff21a3109b1dba2d286ff415463e6fb1fca3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:15 -0800 Subject: rtnetlink: Protect link_ops by mutex. rtnl_link_unregister() holds RTNL and calls synchronize_srcu(), but rtnl_newlink() will acquire SRCU frist and then RTNL. Then, we need to unlink ops and call synchronize_srcu() outside of RTNL to avoid the deadlock. rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); Let's move as such and add a mutex to protect link_ops. Now, link_ops is protected by its dedicated mutex and rtnl_link_register() no longer needs to hold RTNL. While at it, we move the initialisation of ops->dellink and ops->srcu out of the mutex scope. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 3ebfcc6e56fd..7559020f760c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -71,7 +71,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally, protected by RTNL and SRCU + * @list: Used internally, protected by link_ops_mutex and SRCU * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0246ecec7fa..21154ef0048f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -466,6 +466,7 @@ void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); +static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) @@ -508,14 +509,6 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) struct rtnl_link_ops *tmp; int err; - /* When RTNL is removed, add lock for link_ops. */ - ASSERT_RTNL(); - - list_for_each_entry(tmp, &link_ops, list) { - if (!strcmp(ops->kind, tmp->kind)) - return -EEXIST; - } - /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not @@ -528,9 +521,20 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (err) return err; + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); - return 0; + return err; } EXPORT_SYMBOL_GPL(__rtnl_link_register); @@ -598,14 +602,17 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - /* Close the race with setup_net() and cleanup_net() */ - down_write(&pernet_ops_rwsem); - rtnl_lock_unregistering_all(); - + mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); + /* Close the race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock_unregistering_all(); + for_each_net(net) __rtnl_kill_links(net, ops); -- cgit v1.2.3 From 68297dbb967f87c3c92af9d2f652270f57c547c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:16 -0800 Subject: rtnetlink: Remove __rtnl_link_register() link_ops is protected by link_ops_mutex and no longer needs RTNL, so we have no reason to have __rtnl_link_register() separately. Let's remove it and call rtnl_link_register() from ifb.ko and dummy.ko. Note that both modules' init() work on init_net only, so we need not export pernet_ops_rwsem and can use rtnl_net_lock() there. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 17 ++++++----------- drivers/net/ifb.c | 17 ++++++----------- include/net/rtnetlink.h | 2 -- net/core/net_namespace.c | 1 - net/core/rtnetlink.c | 35 +++++++---------------------------- 5 files changed, 19 insertions(+), 53 deletions(-) (limited to 'include/net') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index 72618b6af44e..005d79975f3b 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -166,27 +166,22 @@ err: static int __init dummy_init_module(void) { - bool need_unregister = false; int i, err = 0; - down_write(&pernet_ops_rwsem); - rtnl_lock(); - err = __rtnl_link_register(&dummy_link_ops); + err = rtnl_link_register(&dummy_link_ops); if (err < 0) - goto out; + return err; + + rtnl_net_lock(&init_net); for (i = 0; i < numdummies && !err; i++) { err = dummy_init_one(); cond_resched(); } - if (err < 0) - need_unregister = true; -out: - rtnl_unlock(); - up_write(&pernet_ops_rwsem); + rtnl_net_unlock(&init_net); - if (need_unregister) + if (err < 0) rtnl_link_unregister(&dummy_link_ops); return err; diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index a4b9ec4e8f30..67424888ff0a 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -424,27 +424,22 @@ err: static int __init ifb_init_module(void) { - bool need_unregister = false; int i, err; - down_write(&pernet_ops_rwsem); - rtnl_lock(); - err = __rtnl_link_register(&ifb_link_ops); + err = rtnl_link_register(&ifb_link_ops); if (err < 0) - goto out; + return err; + + rtnl_net_lock(&init_net); for (i = 0; i < numifbs && !err; i++) { err = ifb_init_one(i); cond_resched(); } - if (err) - need_unregister = true; -out: - rtnl_unlock(); - up_write(&pernet_ops_rwsem); + rtnl_net_unlock(&init_net); - if (need_unregister) + if (err) rtnl_link_unregister(&ifb_link_ops); return err; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 7559020f760c..ef7c11f0d74c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -164,8 +164,6 @@ struct rtnl_link_ops { int *prividx, int attr); }; -int __rtnl_link_register(struct rtnl_link_ops *ops); - int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 809b48c0a528..157021ced442 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -56,7 +56,6 @@ static bool init_net_initialized; * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); -EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21154ef0048f..e8357a3b9c7e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -495,20 +495,21 @@ static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) } /** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * * Returns 0 on success or a negative error code. */ -int __rtnl_link_register(struct rtnl_link_ops *ops) +int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not @@ -536,28 +537,6 @@ unlock: return err; } -EXPORT_SYMBOL_GPL(__rtnl_link_register); - -/** - * rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * Returns 0 on success or a negative error code. - */ -int rtnl_link_register(struct rtnl_link_ops *ops) -{ - int err; - - /* Sanity-check max sizes to avoid stack buffer overflow. */ - if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || - ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) - return -EINVAL; - - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); - return err; -} EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -- cgit v1.2.3 From 28690e5361c05fd4ef0ca3a17d1c667cba790554 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:18 -0800 Subject: rtnetlink: Add peer_type in struct rtnl_link_ops. In ops->newlink(), veth, vxcan, and netkit call rtnl_link_get_net() with a net pointer, which is the first argument of ->newlink(). rtnl_link_get_net() could return another netns based on IFLA_NET_NS_PID and IFLA_NET_NS_FD in the peer device's attributes. We want to get it and fill rtnl_nets->nets[] in advance in rtnl_newlink() for per-netns RTNL. All of the three get the peer netns in the same way: 1. Call rtnl_nla_parse_ifinfomsg() 2. Call ops->validate() (vxcan doesn't have) 3. Call rtnl_link_get_net_tb() Let's add a new field peer_type to struct rtnl_link_ops and prefetch netns in the peer ifla to add it to rtnl_nets in rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ef7c11f0d74c..bef76abcff8d 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -75,6 +75,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit + * @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -116,6 +117,7 @@ struct rtnl_link_ops { void (*setup)(struct net_device *dev); bool netns_refund; + const u16 peer_type; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 960d9d2c6aec..1af187a4a3f1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2492,9 +2492,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { - struct net *net; + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -2502,8 +2503,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); @@ -3765,6 +3775,37 @@ out_unregister: goto out; } +static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets, + const struct rtnl_link_ops *ops, + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net *net; + int err; + + if (!data || !data[ops->peer_type]) + return 0; + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); + if (err < 0) + return err; + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return err; + } + + net = rtnl_link_get_net_ifla(tb); + if (IS_ERR(net)) + return PTR_ERR(net); + if (net) + rtnl_nets_add(rtnl_nets, net); + + return 0; +} + static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, @@ -3893,12 +3934,18 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) goto put_ops; } + + if (ops->peer_type) { + ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack); + if (ret < 0) + goto put_ops; + } } tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { ret = PTR_ERR(tgt_net); - goto put_ops; + goto put_net; } rtnl_nets_add(&rtnl_nets, tgt_net); -- cgit v1.2.3 From 636af13f213bf9b28a34254327934bc72a797754 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:23 -0800 Subject: rtnetlink: Register rtnl_dellink() and rtnl_setlink() with RTNL_FLAG_DOIT_PERNET_WIP. Currently, rtnl_setlink() and rtnl_dellink() cannot be fully converted to per-netns RTNL due to a lack of handling peer/lower/upper devices in different netns. For example, when we change a device in rtnl_setlink() and need to propagate that to its upper devices, we want to avoid acquiring all netns locks, for which we do not know the upper limit. The same situation happens when we remove a device. rtnl_dellink() could be transformed to remove a single device in the requested netns and delegate other devices to per-netns work, and rtnl_setlink() might be ? Until we come up with a better idea, let's use a new flag RTNL_FLAG_DOIT_PERNET_WIP for rtnl_dellink() and rtnl_setlink(). This will unblock converting RTNL users where such devices are not related. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Nikolay Aleksandrov Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241108004823.29419-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 1 + net/core/rtnetlink.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bef76abcff8d..bc0069a8b6ea 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,6 +13,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), #define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED +#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 30191d17add3..327fa4957929 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3379,6 +3379,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; struct net *tgt_net; int err; @@ -3397,6 +3398,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3409,7 +3416,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, else if (!err) err = -ENODEV; - put_net(tgt_net); + rtnl_nets_unlock(&rtnl_nets); errout: return err; } @@ -3494,6 +3501,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } + rtnl_net_lock(tgt_net); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3508,6 +3517,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, else err = -EINVAL; + rtnl_net_unlock(tgt_net); + if (netnsid >= 0) put_net(tgt_net); @@ -6994,10 +7005,12 @@ static struct pernet_operations rtnetlink_net_ops = { static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, .flags = RTNL_FLAG_DOIT_PERNET}, - {.msgtype = RTM_DELLINK, .doit = rtnl_dellink}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, - {.msgtype = RTM_SETLINK, .doit = rtnl_setlink}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, -- cgit v1.2.3 From 3fcbecbdeb048dfd1bea824f4276717fed02d10e Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:32 +0000 Subject: net: Add control functions for irq suspension The napi_suspend_irqs routine bootstraps irq suspension by elongating the defer timeout to irq_suspend_timeout. The napi_resume_irqs routine effectively cancels irq suspension by forcing the napi to be scheduled immediately. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 3 +++ net/core/dev.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f03040baaefd..c858270141bc 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -52,6 +52,9 @@ void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_suspend_irqs(unsigned int napi_id); +void napi_resume_irqs(unsigned int napi_id); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { diff --git a/net/core/dev.c b/net/core/dev.c index 4d910872963f..13d00fc10f55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6507,6 +6507,43 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + } + rcu_read_unlock(); +} + +void napi_resume_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. + */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, -- cgit v1.2.3 From 37653a0b8a6f5d6ab23daa8e585c5ed24a0fc500 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:53 +0800 Subject: net: ip: make fib_validate_source() support drop reasons In this commit, we make fib_validate_source() and __fib_validate_source() return -reason instead of errno on error. The return value of fib_validate_source can be -errno, 0, and 1. It's hard to make fib_validate_source() return drop reasons directly. The fib_validate_source() will return 1 if the scope of the source(revert) route is HOST. And the __mkroute_input() will mark the skb with IPSKB_DOREDIRECT in this case (combine with some other conditions). And then, a REDIRECT ICMP will be sent in ip_forward() if this flag exists. We can't pass this information to __mkroute_input if we make fib_validate_source() return drop reasons. Therefore, we introduce the wrapper fib_validate_source_reason() for fib_validate_source(), which will return the drop reasons on error. In the origin logic, LINUX_MIB_IPRPFILTER will be counted if fib_validate_source() return -EXDEV. And now, we need to adjust it by checking "reason == SKB_DROP_REASON_IP_RPFILTER". However, this will take effect only after the patch "net: ip: make ip_route_input_noref() return drop reasons", as we can't pass the drop reasons from fib_validate_source() to ip_rcv_finish_core() in this patch. Following new drop reasons are added in this patch: SKB_DROP_REASON_IP_LOCAL_SOURCE SKB_DROP_REASON_IP_INVALID_SOURCE Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 10 ++++++++++ include/net/ip_fib.h | 12 ++++++++++++ net/ipv4/fib_frontend.c | 17 ++++++++++++----- net/ipv4/ip_input.c | 4 +--- net/ipv4/route.c | 33 +++++++++++++++++++-------------- 5 files changed, 54 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d59bb96c5a02..62a60be1db84 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -76,6 +76,8 @@ FN(INVALID_PROTO) \ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ + FN(IP_LOCAL_SOURCE) \ + FN(IP_INVALID_SOURCE) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -373,6 +375,14 @@ enum skb_drop_reason { * IPSTATS_MIB_INADDRERRORS */ SKB_DROP_REASON_IP_INNOROUTES, + /** @SKB_DROP_REASON_IP_LOCAL_SOURCE: the source ip is local */ + SKB_DROP_REASON_IP_LOCAL_SOURCE, + /** + * @SKB_DROP_REASON_IP_INVALID_SOURCE: the source ip is invalid: + * 1) source ip is multicast or limited broadcast + * 2) source ip is zero and not IGMP + */ + SKB_DROP_REASON_IP_INVALID_SOURCE, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b6e44f4eaa4c..a113c11ab56b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -452,6 +452,18 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); +static inline enum skb_drop_reason +fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, + dscp_t dscp, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, + itag); + if (err < 0) + return -err; + return SKB_NOT_DROPPED_YET; +} + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0c9ce934b490..87bb36a5bdec 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -346,6 +346,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); + enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; @@ -377,9 +378,15 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; - if (res.type != RTN_UNICAST && - (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) - goto e_inval; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; + goto e_inval; + } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { + reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; + goto e_inval; + } + } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); @@ -412,9 +419,9 @@ last_resort: return 0; e_inval: - return -EINVAL; + return -reason; e_rpf: - return -EXDEV; + return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 89bb63da6852..c40a26972884 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -425,10 +425,8 @@ drop: return NET_RX_DROP; drop_error: - if (err == -EXDEV) { - drop_reason = SKB_DROP_REASON_IP_RPFILTER; + if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - } goto drop; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ccdbe9c70132..c38e95d9da9e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1682,7 +1682,7 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { - int err; + enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) @@ -1700,10 +1700,10 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, - itag); - if (err < 0) - return err; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, itag); + if (reason) + return -EINVAL; } return 0; } @@ -1801,6 +1801,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { + err = -EINVAL; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2153,6 +2154,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); + enum skb_drop_reason reason; int err = -EINVAL; u32 tag = 0; @@ -2171,9 +2173,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, - &tag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, + in_dev, &tag); + if (reason) goto martian_source; skip_validate_source: @@ -2215,6 +2217,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); @@ -2309,10 +2312,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; } + err = -EINVAL; if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, - in_dev, &itag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, + 0, dev, in_dev, &itag); + if (reason) goto martian_source; goto local_input; } @@ -2333,9 +2337,10 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, - &itag); - if (err < 0) + err = -EINVAL; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, &itag); + if (reason) goto martian_source; } flags |= RTCF_BROADCAST; -- cgit v1.2.3 From d46f827016d891dbc234cb05c406180f77fb3b2d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:55 +0800 Subject: net: ip: make ip_mc_validate_source() return drop reason Make ip_mc_validate_source() return drop reason, and adjust the call of it in ip_route_input_mc(). Another caller of it is ip_rcv_finish_core->udp_v4_early_demux, and the errno is not checked in detail, so we don't do more adjustment for it. The drop reason "SKB_DROP_REASON_IP_LOCALNET" is added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 3 +++ include/net/route.h | 7 ++++--- net/ipv4/route.c | 35 +++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 62a60be1db84..a2a1fb90e0e5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -78,6 +78,7 @@ FN(IP_INNOROUTES) \ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ + FN(IP_LOCALNET) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -383,6 +384,8 @@ enum skb_drop_reason { * 2) source ip is zero and not IGMP */ SKB_DROP_REASON_IP_INVALID_SOURCE, + /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ + SKB_DROP_REASON_IP_LOCALNET, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/route.h b/include/net/route.h index 0a690adfdff5..e2e1922c58fb 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -199,9 +199,10 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag); +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b20316789baa..ef0b5ffda843 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1678,34 +1678,37 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag) +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) - return -EINVAL; + return SKB_DROP_REASON_NOT_SPECIFIED; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - skb->protocol != htons(ETH_P_IP)) - return -EINVAL; + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + return SKB_DROP_REASON_IP_INVALID_SOURCE; + + if (skb->protocol != htons(ETH_P_IP)) + return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - return -EINVAL; + return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) - return -EINVAL; + return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) - return -EINVAL; + return reason; } - return 0; + return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ @@ -1715,14 +1718,14 @@ ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; + enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; - int err; - err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, - &itag); - if (err) - return SKB_DROP_REASON_NOT_SPECIFIED; + reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, + &itag); + if (reason) + return reason; if (our) flags |= RTCF_LOCAL; -- cgit v1.2.3 From 5b92112acd8e2ed84a4df653fc20575f4da6fa49 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:56 +0800 Subject: net: ip: make ip_route_input_slow() return drop reasons In this commit, we make ip_route_input_slow() return skb drop reasons, and following new skb drop reasons are added: SKB_DROP_REASON_IP_INVALID_DEST The only caller of ip_route_input_slow() is ip_route_input_rcu(), and we adjust it by making it return -EINVAL on error. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 +++++ net/ipv4/route.c | 56 +++++++++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2a1fb90e0e5..74624d369d48 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -79,6 +79,7 @@ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ FN(IP_LOCALNET) \ + FN(IP_INVALID_DEST) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -386,6 +387,11 @@ enum skb_drop_reason { SKB_DROP_REASON_IP_INVALID_SOURCE, /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ SKB_DROP_REASON_IP_LOCALNET, + /** + * @SKB_DROP_REASON_IP_INVALID_DEST: the dest ip is invalid: + * 1) dest ip is 0 + */ + SKB_DROP_REASON_IP_INVALID_DEST, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ef0b5ffda843..b73f0355cc38 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2217,9 +2217,10 @@ static struct net_device *ip_rt_get_dev(struct net *net, * called with rcu_read_lock() */ -static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct fib_result *res) +static enum skb_drop_reason +ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2249,8 +2250,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } res->fi = NULL; res->table = NULL; @@ -2260,21 +2263,29 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(daddr)) + if (ipv4_is_zeronet(daddr)) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; + } } else if (ipv4_is_loopback(saddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } } /* @@ -2329,19 +2340,26 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res->type != RTN_UNICAST) + if (res->type != RTN_UNICAST) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); -out: return err; + if (!err) + reason = SKB_NOT_DROPPED_YET; + +out: + return reason; brd_input: - if (skb->protocol != htons(ETH_P_IP)) - goto e_inval; + if (skb->protocol != htons(ETH_P_IP)) { + reason = SKB_DROP_REASON_INVALID_PROTO; + goto out; + } if (!ipv4_is_zeronet(saddr)) { - err = -EINVAL; reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) @@ -2362,7 +2380,7 @@ local_input: rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; } } @@ -2399,7 +2417,7 @@ local_input: rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; no_route: @@ -2420,12 +2438,8 @@ martian_destination: &daddr, &saddr, dev->name); #endif -e_inval: - err = -EINVAL; - goto out; - e_nobufs: - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: @@ -2482,7 +2496,7 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return reason ? -EINVAL : 0; } - return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res) ? -EINVAL : 0; } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- cgit v1.2.3 From 82d9983ebeb871cb5abd27c12a950c14c68772e1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:58 +0800 Subject: net: ip: make ip_route_input_noref() return drop reasons In this commit, we make ip_route_input_noref() return drop reasons, which come from ip_route_input_rcu(). We need adjust the callers of ip_route_input_noref() to make sure the return value of ip_route_input_noref() is used properly. The errno that ip_route_input_noref() returns comes from ip_route_input and bpf_lwt_input_reroute in the origin logic, and we make them return -EINVAL on error instead. In the following patch, we will make ip_route_input() returns drop reasons too. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 15 ++++++++------- net/core/lwt_bpf.c | 6 ++++-- net/ipv4/ip_fragment.c | 11 ++++++----- net/ipv4/ip_input.c | 7 ++++--- net/ipv4/route.c | 7 ++++--- 5 files changed, 26 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index e2e1922c58fb..b85ffa3e042b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,9 @@ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev); +enum skb_drop_reason +ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); @@ -212,18 +213,18 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, struct net_device *devin) { - int err; + enum skb_drop_reason reason; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, dscp, devin); - if (!err) { + reason = ip_route_input_noref(skb, dst, src, dscp, devin); + if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) - err = -EINVAL; + reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); - return err; + return reason ? -EINVAL : 0; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index e0ca24a58810..8a78bff53b2c 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -88,6 +88,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, static int bpf_lwt_input_reroute(struct sk_buff *skb) { + enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { @@ -96,8 +97,9 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) dev_hold(dev); skb_dst_drop(skb); - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 48e2810f1f27..07036a2943c1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -132,12 +132,12 @@ static bool frag_expire_skip_icmp(u32 user) */ static void ip_expire(struct timer_list *t) { + enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; - int err; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; @@ -175,14 +175,15 @@ static void ip_expire(struct timer_list *t) /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), - head->dev); - if (err) + reason = ip_route_input_noref(head, iph->daddr, iph->saddr, + ip4h_dscp(iph), head->dev); + if (reason) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ + reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -195,7 +196,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + kfree_skb_reason(head, reason); ipq_put(qp); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c40a26972884..513eb0c6435a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -362,10 +362,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); - if (unlikely(err)) + drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (unlikely(drop_reason)) goto drop_error; + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 270bc8c96619..5a7edb66174a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2500,8 +2500,9 @@ ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev) +enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, + __be32 saddr, dscp_t dscp, + struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; @@ -2510,7 +2511,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } EXPORT_SYMBOL(ip_route_input_noref); -- cgit v1.2.3 From 50038bf38e6577a15d52b890d82c197cf3b163a0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:59 +0800 Subject: net: ip: make ip_route_input() return drop reasons In this commit, we make ip_route_input() return skb drop reasons that come from ip_route_input_noref(). Meanwhile, adjust all the call to it. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- net/bridge/br_netfilter_hooks.c | 11 ++++++----- net/ipv4/icmp.c | 2 +- net/ipv4/ip_options.c | 2 +- net/ipv6/seg6_local.c | 14 +++++++------- 5 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index b85ffa3e042b..fb3433dc9c72 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -210,8 +210,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - dscp_t dscp, struct net_device *devin) +static inline enum skb_drop_reason +ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, + struct net_device *devin) { enum skb_drop_reason reason; @@ -224,7 +225,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, } rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 7f2f40cef5fe..451e45b9a6a5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -373,8 +373,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *dev = skb->dev, *br_indev; const struct iphdr *iph = ip_hdr(skb); + enum skb_drop_reason reason; struct rtable *rt; - int err; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { @@ -390,9 +390,9 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { - err = ip_route_input(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); - if (err) { + reason = ip_route_input(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (reason) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a @@ -402,7 +402,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ * martian destinations: loopback destinations and destination * 0.0.0.0. In both cases the packet will be dropped because the * destination is the loopback device and not the bridge. */ - if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) + if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev || + IN_DEV_FORWARD(in_dev)) goto free_skb; rt = ip_route_output(net, iph->daddr, 0, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 33eec844a5a0..4f088fa1c2f2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -545,7 +545,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - dscp, rt2->dst.dev); + dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 81e86e5defee..e3321932bec0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -618,7 +618,7 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), - dev); + dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c74705ead984..ac1dbd492c22 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -954,10 +954,10 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); + enum skb_drop_reason reason; struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; - int err; slwt = seg6_local_lwtunnel(orig_dst->lwtstate); @@ -967,9 +967,9 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, skb_dst_drop(skb); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) { - kfree_skb(skb); + reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); + if (reason) { + kfree_skb_reason(skb, reason); return -EINVAL; } @@ -1174,8 +1174,8 @@ drop: static int input_action_end_dt4(struct sk_buff *skb, struct seg6_local_lwt *slwt) { + enum skb_drop_reason reason; struct iphdr *iph; - int err; if (!decap_and_validate(skb, IPPROTO_IPIP)) goto drop; @@ -1193,8 +1193,8 @@ static int input_action_end_dt4(struct sk_buff *skb, iph = ip_hdr(skb); - err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); - if (unlikely(err)) + reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); + if (unlikely(reason)) goto drop; return dst_input(skb); -- cgit v1.2.3 From d9340d1e02779dbf83d53b0deb4068c7768b8261 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:00 +0800 Subject: net: ip: make ip_mkroute_input/__mkroute_input return drop reasons In this commit, we make ip_mkroute_input() and __mkroute_input() return drop reasons. The drop reason "SKB_DROP_REASON_ARP_PVLAN_DISABLE" is introduced for the case: the packet which is not IP is forwarded to the in_dev, and the proxy_arp_pvlan is not enabled. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 7 +++++++ net/ipv4/route.c | 34 ++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 74624d369d48..6c5a1ea209a2 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -104,6 +104,7 @@ FN(IP_TUNNEL_ECN) \ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ + FN(ARP_PVLAN_DISABLE) \ FNe(MAX) /** @@ -477,6 +478,12 @@ enum skb_drop_reason { * the MAC address of the local netdev. */ SKB_DROP_REASON_LOCAL_MAC, + /** + * @SKB_DROP_REASON_ARP_PVLAN_DISABLE: packet which is not IP is + * forwarded to the in_dev, and the proxy_arp_pvlan is not + * enabled. + */ + SKB_DROP_REASON_ARP_PVLAN_DISABLE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5a7edb66174a..2697a6c88416 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1782,10 +1782,12 @@ static void ip_handle_martian_source(struct net_device *dev, } /* called in rcu_read_lock() section */ -static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, - struct in_device *in_dev, __be32 daddr, - __be32 saddr, dscp_t dscp) +static enum skb_drop_reason +__mkroute_input(struct sk_buff *skb, const struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; @@ -1799,13 +1801,13 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); - return -EINVAL; + return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { - err = -EINVAL; + reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -1833,7 +1835,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { - err = -EINVAL; + reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } @@ -1856,7 +1858,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto cleanup; } @@ -1870,9 +1872,9 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: - err = 0; - cleanup: - return err; + reason = SKB_NOT_DROPPED_YET; +cleanup: + return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -2130,9 +2132,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - struct in_device *in_dev, __be32 daddr, - __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) +static enum skb_drop_reason +ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { @@ -2346,9 +2349,8 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, } make_route: - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); - if (!err) - reason = SKB_NOT_DROPPED_YET; + reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, + flkeys); out: return reason; -- cgit v1.2.3 From 479aed04e84a5d66caa3b25bfc651292c153ef70 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:01 +0800 Subject: net: ip: make ip_route_use_hint() return drop reasons In this commit, we make ip_route_use_hint() return drop reasons. The drop reasons that we return are similar to what we do in ip_route_input_slow(), and no drop reasons are added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- net/ipv4/ip_input.c | 9 ++++----- net/ipv4/route.c | 28 +++++++++++++++++----------- 3 files changed, 25 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index fb3433dc9c72..84cb1e04f5cd 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -206,9 +206,10 @@ ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint); +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint); static inline enum skb_drop_reason ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 513eb0c6435a..f0a4dda246ab 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -322,15 +322,14 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, int err, drop_reason; struct rtable *rt; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; - if (ip_can_use_hint(skb, iph, hint)) { - err = ip_route_use_hint(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev, hint); - if (unlikely(err)) + drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev, hint); + if (unlikely(drop_reason)) goto drop_error; } + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2697a6c88416..e5603e84b20d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,28 +2154,34 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint) +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); - enum skb_drop_reason reason; - int err = -EINVAL; u32 tag = 0; if (!in_dev) - return -EINVAL; + return reason; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; @@ -2187,11 +2193,11 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, skip_validate_source: skb_dst_copy(skb, hint); - return 0; + return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - return err; + return reason; } /* get device for dst_alloc with local routes */ -- cgit v1.2.3 From 6b998404c71ecff9ebeed343c1ce21f9df3ef131 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 12 Nov 2024 21:36:29 +0100 Subject: net: simplify eeecfg_mac_can_tx_lpi Simplify the function. Signed-off-by: Heiner Kallweit Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/f9a4623b-b94c-466c-8733-62057c6d9a17@gmail.com Signed-off-by: Jakub Kicinski --- include/net/eee.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/eee.h b/include/net/eee.h index 84837aba3cd9..cfab1b8bc46a 100644 --- a/include/net/eee.h +++ b/include/net/eee.h @@ -13,10 +13,7 @@ struct eee_config { static inline bool eeecfg_mac_can_tx_lpi(const struct eee_config *eeecfg) { /* eee_enabled is the master on/off */ - if (!eeecfg->eee_enabled || !eeecfg->tx_lpi_enabled) - return false; - - return true; + return eeecfg->eee_enabled && eeecfg->tx_lpi_enabled; } static inline void eeecfg_to_eee(struct ethtool_keee *eee, -- cgit v1.2.3 From 8eb36164d1a6769a20ed43033510067ff3dab9ee Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 11 Nov 2024 10:16:49 +0000 Subject: bonding: add ns target multicast address to slave device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4598380f9c54 ("bonding: fix ns validation on backup slaves") tried to resolve the issue where backup slaves couldn't be brought up when receiving IPv6 Neighbor Solicitation (NS) messages. However, this fix only worked for drivers that receive all multicast messages, such as the veth interface. For standard drivers, the NS multicast message is silently dropped because the slave device is not a member of the NS target multicast group. To address this, we need to make the slave device join the NS target multicast group, ensuring it can receive these IPv6 NS messages to validate the slave’s status properly. There are three policies before joining the multicast group: 1. All settings must be under active-backup mode (alb and tlb do not support arp_validate), with backup slaves and slaves supporting multicast. 2. We can add or remove multicast groups when arp_validate changes. 3. Other operations, such as enslaving, releasing, or setting NS targets, need to be guarded by arp_validate. Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 16 +++++++- drivers/net/bonding/bond_options.c | 82 +++++++++++++++++++++++++++++++++++++- include/net/bond_options.h | 2 + 3 files changed, 98 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b1bffd8e9a95..15e0f14d0d49 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1008,6 +1008,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, if (bond->dev->flags & IFF_UP) bond_hw_addr_flush(bond->dev, old_active->dev); + + bond_slave_ns_maddrs_add(bond, old_active); } if (new_active) { @@ -1024,6 +1026,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, dev_mc_sync(new_active->dev, bond->dev); netif_addr_unlock_bh(bond->dev); } + + bond_slave_ns_maddrs_del(bond, new_active); } } @@ -2341,6 +2345,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, bond_compute_features(bond); bond_set_carrier(bond); + /* Needs to be called before bond_select_active_slave(), which will + * remove the maddrs if the slave is selected as active slave. + */ + bond_slave_ns_maddrs_add(bond, new_slave); + if (bond_uses_primary(bond)) { block_netpoll_tx(); bond_select_active_slave(bond); @@ -2350,7 +2359,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); - if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { if (bond->xdp_prog) { @@ -2548,6 +2556,12 @@ static int __bond_release_one(struct net_device *bond_dev, if (oldcurrent == slave) bond_change_active_slave(bond, NULL); + /* Must be called after bond_change_active_slave () as the slave + * might change from an active slave to a backup slave. Then it is + * necessary to clear the maddrs on the backup slave. + */ + bond_slave_ns_maddrs_del(bond, slave); + if (bond_is_lb(bond)) { /* Must be called only after the slave has been * detached from the list and the curr_active_slave diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 95d59a18c022..327b6ecdc77e 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -15,6 +15,7 @@ #include #include +#include static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval); @@ -1234,6 +1235,68 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond, } #if IS_ENABLED(CONFIG_IPV6) +static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *slave) +{ + return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + !bond_is_active_slave(slave) && + slave->dev->flags & IFF_MULTICAST; +} + +static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) +{ + struct in6_addr *targets = bond->params.ns_targets; + char slot_maddr[MAX_ADDR_LEN]; + int i; + + if (!slave_can_set_ns_maddr(bond, slave)) + return; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + if (ipv6_addr_any(&targets[i])) + break; + + if (!ndisc_mc_map(&targets[i], slot_maddr, slave->dev, 0)) { + if (add) + dev_mc_add(slave->dev, slot_maddr); + else + dev_mc_del(slave->dev, slot_maddr); + } + } +} + +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) +{ + if (!bond->params.arp_validate) + return; + slave_set_ns_maddrs(bond, slave, true); +} + +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) +{ + if (!bond->params.arp_validate) + return; + slave_set_ns_maddrs(bond, slave, false); +} + +static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave, + struct in6_addr *target, struct in6_addr *slot) +{ + char target_maddr[MAX_ADDR_LEN], slot_maddr[MAX_ADDR_LEN]; + + if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave)) + return; + + /* remove the previous maddr from slave */ + if (!ipv6_addr_any(slot) && + !ndisc_mc_map(slot, slot_maddr, slave->dev, 0)) + dev_mc_del(slave->dev, slot_maddr); + + /* add new maddr on slave if target is set */ + if (!ipv6_addr_any(target) && + !ndisc_mc_map(target, target_maddr, slave->dev, 0)) + dev_mc_add(slave->dev, target_maddr); +} + static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct in6_addr *target, unsigned long last_rx) @@ -1243,8 +1306,10 @@ static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct slave *slave; if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) { - bond_for_each_slave(bond, slave, iter) + bond_for_each_slave(bond, slave, iter) { slave->target_last_arp_rx[slot] = last_rx; + slave_set_ns_maddr(bond, slave, target, &targets[slot]); + } targets[slot] = *target; } } @@ -1296,15 +1361,30 @@ static int bond_option_ns_ip6_targets_set(struct bonding *bond, { return -EPERM; } + +static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) {} + +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) {} + +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) {} #endif static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval) { + bool changed = !!bond->params.arp_validate != !!newval->value; + struct list_head *iter; + struct slave *slave; + netdev_dbg(bond->dev, "Setting arp_validate to %s (%llu)\n", newval->string, newval->value); bond->params.arp_validate = newval->value; + if (changed) { + bond_for_each_slave(bond, slave, iter) + slave_set_ns_maddrs(bond, slave, !!bond->params.arp_validate); + } + return 0; } diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 473a0147769e..18687ccf0638 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -161,5 +161,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond); #if IS_ENABLED(CONFIG_IPV6) void bond_option_ns_ip6_targets_clear(struct bonding *bond); #endif +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ -- cgit v1.2.3 From a8ee6b900c147d3bedced6c52ba6cb603226aaa3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:50 +0100 Subject: netfilter: nf_tables: prepare for multiple elements in nft_trans_elem structure Add helpers to release the individual elements contained in the trans_elem container structure. No functional change intended. Followup patch will add 'nelems' member and will turn 'priv' into a flexible array. These helpers can then loop over all elements. Care needs to be taken to handle a mix of new elements and existing elements that are being updated (e.g. timeout refresh). Before this patch, NEWSETELEM transaction with update is released early so nft_trans_set_elem_destroy() won't get called, so we need to skip elements marked as update. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 21 ++-- net/netfilter/nf_tables_api.c | 228 +++++++++++++++++++++++++++----------- 2 files changed, 173 insertions(+), 76 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f24278767bfd..37af0b174c39 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1759,28 +1759,25 @@ enum nft_trans_elem_flags { NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; -struct nft_trans_elem { - struct nft_trans nft_trans; - struct nft_set *set; - struct nft_elem_priv *elem_priv; +struct nft_trans_one_elem { + struct nft_elem_priv *priv; u64 timeout; u64 expiration; u8 update_flags; +}; + +struct nft_trans_elem { + struct nft_trans nft_trans; + struct nft_set *set; bool bound; + unsigned int nelems; + struct nft_trans_one_elem elems[] __counted_by(nelems); }; #define nft_trans_container_elem(t) \ container_of(t, struct nft_trans_elem, nft_trans) #define nft_trans_elem_set(trans) \ nft_trans_container_elem(trans)->set -#define nft_trans_elem_priv(trans) \ - nft_trans_container_elem(trans)->elem_priv -#define nft_trans_elem_update_flags(trans) \ - nft_trans_container_elem(trans)->update_flags -#define nft_trans_elem_timeout(trans) \ - nft_trans_container_elem(trans)->timeout -#define nft_trans_elem_expiration(trans) \ - nft_trans_container_elem(trans)->expiration #define nft_trans_elem_set_bound(trans) \ nft_trans_container_elem(trans)->bound diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 75c84b17ab99..0882f78c2204 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6446,13 +6446,17 @@ static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { + struct nft_trans_elem *te; struct nft_trans *trans; - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1)); if (trans == NULL) return NULL; - nft_trans_elem_set(trans) = set; + te = nft_trans_container_elem(trans); + te->nelems = 1; + te->set = set; + return trans; } @@ -6574,28 +6578,51 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } /* Drop references and destroy. Called from gc, dynset and abort path. */ -void nft_set_elem_destroy(const struct nft_set *set, - const struct nft_elem_priv *elem_priv, - bool destroy_expr) +static void __nft_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - struct nft_ctx ctx = { - .net = read_pnet(&set->net), - .family = set->table->family, - }; nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) - nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); + nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem_priv); } + +/* Drop references and destroy. Called from gc and dynset. */ +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) +{ + struct nft_ctx ctx = { + .net = read_pnet(&set->net), + .family = set->table->family, + }; + + __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr); +} EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +/* Drop references and destroy. Called from abort path. */ +static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + if (te->elems[i].update_flags) + continue; + + __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); + } +} + /* Destroy element. References have been already dropped in the preparation * path via nft_setelem_data_deactivate(). */ @@ -6611,6 +6638,15 @@ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, kfree(elem_priv); } +static void nft_trans_elems_destroy(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) + nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv); +} + int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]) { @@ -6767,6 +6803,36 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, } } +static void nft_trans_elem_update(const struct nft_set *set, + const struct nft_trans_one_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (elem->update_flags & NFT_TRANS_UPD_TIMEOUT) + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, elem->timeout); + + if (elem->update_flags & NFT_TRANS_UPD_EXPIRATION) + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + elem->expiration); +} + +static void nft_trans_elems_add(const struct nft_ctx *ctx, + struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + const struct nft_trans_one_elem *elem = &te->elems[i]; + + if (elem->update_flags) + nft_trans_elem_update(te->set, elem); + else + nft_setelem_activate(ctx->net, te->set, elem->priv); + + nf_tables_setelem_notify(ctx, te->set, elem->priv, + NFT_MSG_NEWSETELEM); + } +} + static int nft_setelem_catchall_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) @@ -6849,6 +6915,24 @@ static void nft_setelem_remove(const struct net *net, set->ops->remove(net, set, elem_priv); } +static void nft_trans_elems_remove(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + nf_tables_setelem_notify(ctx, te->set, + te->elems[i].priv, + te->nft_trans.msg_type); + + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) { + atomic_dec(&te->set->nelems); + te->set->ndeact--; + } + } +} + static bool nft_setelem_valid_key_end(const struct nft_set *set, struct nlattr **nla, u32 flags) { @@ -7200,22 +7284,26 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { + struct nft_trans_one_elem *update; + + update = &nft_trans_container_elem(trans)->elems[0]; + update_flags = 0; if (timeout != nft_set_ext_timeout(ext2)->timeout) { - nft_trans_elem_timeout(trans) = timeout; + update->timeout = timeout; if (expiration == 0) expiration = timeout; update_flags |= NFT_TRANS_UPD_TIMEOUT; } if (expiration) { - nft_trans_elem_expiration(trans) = expiration; + update->expiration = expiration; update_flags |= NFT_TRANS_UPD_EXPIRATION; } if (update_flags) { - nft_trans_elem_priv(trans) = elem_priv; - nft_trans_elem_update_flags(trans) = update_flags; + update->priv = elem_priv; + update->update_flags = update_flags; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } @@ -7239,7 +7327,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } } - nft_trans_elem_priv(trans) = elem.priv; + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; @@ -7377,6 +7465,50 @@ void nft_setelem_data_deactivate(const struct net *net, nft_use_dec(&(*nft_set_ext_obj(ext))->use); } +/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem. + * No notifications and no ndeact changes. + * + * Returns true if set had been added to (i.e., elements need to be removed again). + */ +static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + bool removed = false; + int i; + + for (i = 0; i < te->nelems; i++) { + if (te->elems[i].update_flags) + continue; + + if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + atomic_dec(&te->set->nelems); + + removed = true; + } + + return removed; +} + +/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */ +static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) { + nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv); + nft_setelem_activate(ctx->net, te->set, te->elems[i].priv); + } + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + te->set->ndeact--; + } +} + static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -7456,7 +7588,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); - nft_trans_elem_priv(trans) = elem.priv; + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; @@ -7483,7 +7615,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, return 0; trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_ATOMIC); + struct_size_t(struct nft_trans_elem, elems, 1), + GFP_ATOMIC); if (!trans) return -ENOMEM; @@ -7492,7 +7625,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; - nft_trans_elem_priv(trans) = elem_priv; + nft_trans_container_elem(trans)->nelems = 1; + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); return 0; @@ -7509,7 +7643,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, return -ENOMEM; nft_setelem_data_deactivate(ctx->net, set, elem_priv); - nft_trans_elem_priv(trans) = elem_priv; + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; @@ -9691,9 +9825,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: - nf_tables_set_elem_destroy(&ctx, - nft_trans_elem_set(trans), - nft_trans_elem_priv(trans)); + nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: @@ -10546,25 +10678,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = nft_trans_container_elem(trans); - if (te->update_flags) { - const struct nft_set_ext *ext = - nft_set_elem_ext(te->set, te->elem_priv); + nft_trans_elems_add(&ctx, te); - if (te->update_flags & NFT_TRANS_UPD_TIMEOUT) { - WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, - te->timeout); - } - if (te->update_flags & NFT_TRANS_UPD_EXPIRATION) { - WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, - get_jiffies_64() + te->expiration); - } - } else { - nft_setelem_activate(net, te->set, te->elem_priv); - } - - nf_tables_setelem_notify(&ctx, te->set, - te->elem_priv, - NFT_MSG_NEWSETELEM); if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, @@ -10576,14 +10691,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DESTROYSETELEM: te = nft_trans_container_elem(trans); - nf_tables_setelem_notify(&ctx, te->set, - te->elem_priv, - trans->msg_type); - nft_setelem_remove(net, te->set, te->elem_priv); - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) { - atomic_dec(&te->set->nelems); - te->set->ndeact--; - } + nft_trans_elems_remove(&ctx, te); + if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, @@ -10703,8 +10812,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem_priv(trans), true); + nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_NEWOBJ: nft_obj_destroy(&ctx, nft_trans_obj(trans)); @@ -10861,18 +10969,15 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - if (nft_trans_elem_update_flags(trans) || - nft_trans_elem_set_bound(trans)) { + if (nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } te = nft_trans_container_elem(trans); - if (!te->set->ops->abort || - nft_setelem_is_catchall(te->set, te->elem_priv)) - nft_setelem_remove(net, te->set, te->elem_priv); - - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) - atomic_dec(&te->set->nelems); + if (!nft_trans_elems_new_abort(&ctx, te)) { + nft_trans_destroy(trans); + break; + } if (te->set->ops->abort && list_empty(&te->set->pending_update)) { @@ -10884,12 +10989,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = nft_trans_container_elem(trans); - if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); - } - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) - te->set->ndeact--; + nft_trans_elems_destroy_abort(&ctx, te); if (te->set->ops->abort && list_empty(&te->set->pending_update)) { -- cgit v1.2.3 From 508180850b732c7a0e3a728460cf3f95f25e1fbd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:53 +0100 Subject: netfilter: nf_tables: allocate element update information dynamically Move the timeout/expire/flag members from nft_trans_one_elem struct into a dybamically allocated structure, only needed when timeout update was requested. This halves size of nft_trans_one_elem struct and allows to compact up to 124 elements in one transaction container rather than 62. This halves memory requirements for a large flush or insert transaction, where ->update remains NULL. Care has to be taken to release the extra data in all spots, including abort path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 10 ++++--- net/netfilter/nf_tables_api.c | 57 ++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 37af0b174c39..80a537ac26cd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1759,11 +1759,15 @@ enum nft_trans_elem_flags { NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; -struct nft_trans_one_elem { - struct nft_elem_priv *priv; +struct nft_elem_update { u64 timeout; u64 expiration; - u8 update_flags; + u8 flags; +}; + +struct nft_trans_one_elem { + struct nft_elem_priv *priv; + struct nft_elem_update *update; }; struct nft_trans_elem { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 679312d71bbe..21b6f7410a1f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6706,7 +6706,8 @@ static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_tra int i; for (i = 0; i < te->nelems; i++) { - if (te->elems[i].update_flags) + /* skip update request, see nft_trans_elems_new_abort() */ + if (!te->elems[i].priv) continue; __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); @@ -6897,12 +6898,13 @@ static void nft_trans_elem_update(const struct nft_set *set, const struct nft_trans_one_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_elem_update *update = elem->update; - if (elem->update_flags & NFT_TRANS_UPD_TIMEOUT) - WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, elem->timeout); + if (update->flags & NFT_TRANS_UPD_TIMEOUT) + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout); - if (elem->update_flags & NFT_TRANS_UPD_EXPIRATION) - WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + elem->expiration); + if (update->flags & NFT_TRANS_UPD_EXPIRATION) + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration); } static void nft_trans_elems_add(const struct nft_ctx *ctx, @@ -6911,15 +6913,16 @@ static void nft_trans_elems_add(const struct nft_ctx *ctx, int i; for (i = 0; i < te->nelems; i++) { - const struct nft_trans_one_elem *elem = &te->elems[i]; + struct nft_trans_one_elem *elem = &te->elems[i]; - if (elem->update_flags) + if (elem->update) nft_trans_elem_update(te->set, elem); else nft_setelem_activate(ctx->net, te->set, elem->priv); nf_tables_setelem_notify(ctx, te->set, elem->priv, NFT_MSG_NEWSETELEM); + kfree(elem->update); } } @@ -7011,6 +7014,8 @@ static void nft_trans_elems_remove(const struct nft_ctx *ctx, int i; for (i = 0; i < te->nelems; i++) { + WARN_ON_ONCE(te->elems[i].update); + nf_tables_setelem_notify(ctx, te->set, te->elems[i].priv, te->nft_trans.msg_type); @@ -7059,7 +7064,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u8 update_flags; u64 expiration; u64 timeout; int err, i; @@ -7374,26 +7378,32 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { - struct nft_trans_one_elem *update; - - update = &nft_trans_container_elem(trans)->elems[0]; + struct nft_elem_update update = { }; - update_flags = 0; if (timeout != nft_set_ext_timeout(ext2)->timeout) { - update->timeout = timeout; + update.timeout = timeout; if (expiration == 0) expiration = timeout; - update_flags |= NFT_TRANS_UPD_TIMEOUT; + update.flags |= NFT_TRANS_UPD_TIMEOUT; } if (expiration) { - update->expiration = expiration; - update_flags |= NFT_TRANS_UPD_EXPIRATION; + update.expiration = expiration; + update.flags |= NFT_TRANS_UPD_EXPIRATION; } - if (update_flags) { - update->priv = elem_priv; - update->update_flags = update_flags; + if (update.flags) { + struct nft_trans_one_elem *ue; + + ue = &nft_trans_container_elem(trans)->elems[0]; + + ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL); + if (!ue->update) { + err = -ENOMEM; + goto err_element_clash; + } + + ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } @@ -7561,14 +7571,19 @@ void nft_setelem_data_deactivate(const struct net *net, * Returns true if set had been added to (i.e., elements need to be removed again). */ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, - const struct nft_trans_elem *te) + struct nft_trans_elem *te) { bool removed = false; int i; for (i = 0; i < te->nelems; i++) { - if (te->elems[i].update_flags) + if (te->elems[i].update) { + kfree(te->elems[i].update); + te->elems[i].update = NULL; + /* Update request, so do not release this element */ + te->elems[i].priv = NULL; continue; + } if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); -- cgit v1.2.3 From 94464a7b71634037b13d54021e0dfd0fb0d8c1f0 Mon Sep 17 00:00:00 2001 From: Danil Pylaev Date: Mon, 21 Oct 2024 12:22:44 +0000 Subject: Bluetooth: Add new quirks for ATS2851 This adds quirks for broken extended create connection, and write auth payload timeout. Signed-off-by: Danil Pylaev Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 14 ++++++++++++++ include/net/bluetooth/hci_core.h | 10 ++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index bab1e3d7452a..4f64066915be 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -300,6 +300,20 @@ enum { */ HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, + /* + * When this quirk is set, the HCI_OP_LE_EXT_CREATE_CONN command is + * disabled. This is required for the Actions Semiconductor ATS2851 + * based controllers, which erroneously claims to support it. + */ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN, + + /* + * When this quirk is set, the command WRITE_AUTH_PAYLOAD_TIMEOUT is + * skipped. This is required for the Actions Semiconductor ATS2851 + * based controllers, due to a race condition in pairing process. + */ + HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, + /* When this quirk is set, MSFT extension monitor tracking by * address filter is supported. Since tracking quantity of each * pattern is limited, this feature supports tracking multiple diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 88265d37aa72..94ddc8684973 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1871,8 +1871,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) /* Use ext create connection if command is supported */ -#define use_ext_conn(dev) ((dev)->commands[37] & 0x80) - +#define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1885,8 +1885,10 @@ void hci_conn_del_sysfs(struct hci_conn *conn); * C24: Mandatory if the LE Controller supports Connection State and either * LE Feature (LL Privacy) or LE Feature (Extended Advertising) is supported */ -#define use_enhanced_conn_complete(dev) (ll_privacy_capable(dev) || \ - ext_adv_capable(dev)) +#define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ + ext_adv_capable(dev)) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ + &(dev)->quirks)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) -- cgit v1.2.3 From 4a5e0ba68676b3a77298cf646cd2b39c94fbd2f5 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:36 +0200 Subject: Bluetooth: ISO: Do not emit LE PA Create Sync if previous is pending The Bluetooth Core spec does not allow a LE PA Create sync command to be sent to Controller if another one is pending (Vol 4, Part E, page 2493). In order to avoid this issue, the HCI_CONN_CREATE_PA_SYNC was added to mark that the LE PA Create Sync command has been sent for a hcon. Once the PA Sync Established event is received, the hcon flag is erased and the next pending hcon is handled. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 3 +- include/net/bluetooth/hci_core.h | 34 +++++++++++ net/bluetooth/hci_conn.c | 123 +++++++++++++++++++++++++++------------ net/bluetooth/hci_event.c | 19 +++++- 4 files changed, 139 insertions(+), 40 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4f64066915be..4becf201b063 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1,7 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated - Copyright 2023 NXP + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky @@ -697,6 +697,7 @@ enum { #define HCI_RSSI_INVALID 127 #define HCI_SYNC_HANDLE_INVALID 0xffff +#define HCI_SID_INVALID 0xff #define HCI_ROLE_MASTER 0x00 #define HCI_ROLE_SLAVE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 94ddc8684973..43474b751a50 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -668,6 +668,7 @@ struct hci_conn { __u8 adv_instance; __u16 handle; __u16 sync_handle; + __u8 sid; __u16 state; __u16 mtu; __u8 mode; @@ -947,6 +948,7 @@ enum { HCI_CONN_CREATE_CIS, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_CREATE_PA_SYNC, HCI_CONN_PA_SYNC, HCI_CONN_PA_SYNC_FAILED, }; @@ -1099,6 +1101,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, + __u8 sid, + bdaddr_t *dst, + __u8 dst_type) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK || bacmp(&c->dst, dst) || + c->dst_type != dst_type || c->sid != sid) + continue; + + rcu_read_unlock(); + return c; + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, bdaddr_t *ba, @@ -1328,6 +1354,13 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) if (c->type != ISO_LINK) continue; + /* Ignore the listen hcon, we are looking + * for the child hcon that was created as + * a result of the PA sync established event. + */ + if (c->state == BT_LISTEN) + continue; + if (c->sync_handle == sync_handle) { rcu_read_unlock(); return c; @@ -1445,6 +1478,7 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); +int hci_pa_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 40c4a36d2be3..f9da12339db8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -952,6 +952,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t conn->tx_power = HCI_TX_POWER_INVALID; conn->max_tx_power = HCI_TX_POWER_INVALID; conn->sync_handle = HCI_SYNC_HANDLE_INVALID; + conn->sid = HCI_SID_INVALID; set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; @@ -2062,73 +2063,119 @@ static int create_big_sync(struct hci_dev *hdev, void *data) static void create_pa_complete(struct hci_dev *hdev, void *data, int err) { - struct hci_cp_le_pa_create_sync *cp = data; - bt_dev_dbg(hdev, ""); if (err) bt_dev_err(hdev, "Unable to create PA: %d", err); +} + +static bool hci_conn_check_create_pa_sync(struct hci_conn *conn) +{ + if (conn->type != ISO_LINK || conn->sid == HCI_SID_INVALID) + return false; - kfree(cp); + return true; } static int create_pa_sync(struct hci_dev *hdev, void *data) { - struct hci_cp_le_pa_create_sync *cp = data; - int err; + struct hci_cp_le_pa_create_sync *cp = NULL; + struct hci_conn *conn; + int err = 0; - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, - sizeof(*cp), cp, HCI_CMD_TIMEOUT); - if (err) { - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - return err; + hci_dev_lock(hdev); + + rcu_read_lock(); + + /* The spec allows only one pending LE Periodic Advertising Create + * Sync command at a time. If the command is pending now, don't do + * anything. We check for pending connections after each PA Sync + * Established event. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2493: + * + * If the Host issues this command when another HCI_LE_Periodic_ + * Advertising_Create_Sync command is pending, the Controller shall + * return the error code Command Disallowed (0x0C). + */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (test_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags)) + goto unlock; } - return hci_update_passive_scan_sync(hdev); + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (hci_conn_check_create_pa_sync(conn)) { + struct bt_iso_qos *qos = &conn->iso_qos; + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + err = -ENOMEM; + goto unlock; + } + + cp->options = qos->bcast.options; + cp->sid = conn->sid; + cp->addr_type = conn->dst_type; + bacpy(&cp->addr, &conn->dst); + cp->skip = cpu_to_le16(qos->bcast.skip); + cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp->sync_cte_type = qos->bcast.sync_cte_type; + + break; + } + } + +unlock: + rcu_read_unlock(); + + hci_dev_unlock(hdev); + + if (cp) { + hci_dev_set_flag(hdev, HCI_PA_SYNC); + set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, + sizeof(*cp), cp, HCI_CMD_TIMEOUT); + if (!err) + err = hci_update_passive_scan_sync(hdev); + + kfree(cp); + + if (err) { + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + } + } + + return err; +} + +int hci_pa_create_sync_pending(struct hci_dev *hdev) +{ + /* Queue start pa_create_sync and scan */ + return hci_cmd_sync_queue(hdev, create_pa_sync, + NULL, create_pa_complete); } struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos) { - struct hci_cp_le_pa_create_sync *cp; struct hci_conn *conn; - int err; - - if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) - return ERR_PTR(-EBUSY); conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; conn->iso_qos = *qos; + conn->dst_type = dst_type; + conn->sid = sid; conn->state = BT_LISTEN; hci_conn_hold(conn); - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - hci_conn_drop(conn); - return ERR_PTR(-ENOMEM); - } - - cp->options = qos->bcast.options; - cp->sid = sid; - cp->addr_type = dst_type; - bacpy(&cp->addr, dst); - cp->skip = cpu_to_le16(qos->bcast.skip); - cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); - cp->sync_cte_type = qos->bcast.sync_cte_type; - - /* Queue start pa_create_sync and scan */ - err = hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete); - if (err < 0) { - hci_conn_drop(conn); - kfree(cp); - return ERR_PTR(err); - } + hci_pa_create_sync_pending(hdev); return conn; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 65f5ed2ded70..fd269fcabc2e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6352,7 +6352,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, struct hci_ev_le_pa_sync_established *ev = data; int mask = hdev->link_mode; __u8 flags = 0; - struct hci_conn *pa_sync; + struct hci_conn *pa_sync, *conn; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6360,6 +6360,20 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, hci_dev_clear_flag(hdev, HCI_PA_SYNC); + conn = hci_conn_hash_lookup_sid(hdev, ev->sid, &ev->bdaddr, + ev->bdaddr_type); + if (!conn) { + bt_dev_err(hdev, + "Unable to find connection for dst %pMR sid 0x%2.2x", + &ev->bdaddr, ev->sid); + goto unlock; + } + + clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + conn->sync_handle = le16_to_cpu(ev->handle); + conn->sid = HCI_SID_INVALID; + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); @@ -6386,6 +6400,9 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, } unlock: + /* Handle any other pending PA sync command */ + hci_pa_create_sync_pending(hdev); + hci_dev_unlock(hdev); } -- cgit v1.2.3 From 42ecf1947135110ea08abeaca39741636f9a2285 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:38 +0200 Subject: Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending The Bluetooth Core spec does not allow a LE BIG Create sync command to be sent to Controller if another one is pending (Vol 4, Part E, page 2586). In order to avoid this issue, the HCI_CONN_CREATE_BIG_SYNC was added to mark that the LE BIG Create Sync command has been sent for a hcon. Once the BIG Sync Established event is received, the hcon flag is erased and the next pending hcon is handled. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 29 ++++++++++++++ net/bluetooth/hci_conn.c | 87 +++++++++++++++++++++++++++++++++------- net/bluetooth/hci_event.c | 20 ++++++++- net/bluetooth/iso.c | 4 +- 5 files changed, 125 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4becf201b063..5bb4eaa52e14 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -29,6 +29,7 @@ #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 #define HCI_MAX_ISO_SIZE 251 +#define HCI_MAX_ISO_BIS 31 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 43474b751a50..c95f7e6ba255 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -711,6 +711,9 @@ struct hci_conn { __s8 tx_power; __s8 max_tx_power; struct bt_iso_qos iso_qos; + __u8 num_bis; + __u8 bis[HCI_MAX_ISO_BIS]; + unsigned long flags; enum conn_reasons conn_reason; @@ -946,6 +949,7 @@ enum { HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, HCI_CONN_CREATE_CIS, + HCI_CONN_CREATE_BIG_SYNC, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, HCI_CONN_CREATE_PA_SYNC, @@ -1295,6 +1299,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn * +hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, + __u8 handle, __u8 num_bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK) + continue; + + if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) { @@ -1479,6 +1507,7 @@ void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); int hci_pa_create_sync_pending(struct hci_dev *hdev); +int hci_le_big_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f9da12339db8..e996e9763666 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2180,34 +2180,93 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, return conn; } +static bool hci_conn_check_create_big_sync(struct hci_conn *conn) +{ + if (!conn->num_bis) + return false; + + return true; +} + +int hci_le_big_create_sync_pending(struct hci_dev *hdev) +{ + DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); + struct hci_conn *conn; + + rcu_read_lock(); + + pdu->num_bis = 0; + + /* The spec allows only one pending LE BIG Create Sync command at + * a time. If the command is pending now, don't do anything. We + * check for pending connections after each BIG Sync Established + * event. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2586: + * + * If the Host sends this command when the Controller is in the + * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ + * Established event has not been generated, the Controller shall + * return the error code Command Disallowed (0x0C). + */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (test_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags)) + goto unlock; + } + + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (hci_conn_check_create_big_sync(conn)) { + struct bt_iso_qos *qos = &conn->iso_qos; + + set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); + + pdu->handle = qos->bcast.big; + pdu->sync_handle = cpu_to_le16(conn->sync_handle); + pdu->encryption = qos->bcast.encryption; + memcpy(pdu->bcode, qos->bcast.bcode, + sizeof(pdu->bcode)); + pdu->mse = qos->bcast.mse; + pdu->timeout = cpu_to_le16(qos->bcast.timeout); + pdu->num_bis = conn->num_bis; + memcpy(pdu->bis, conn->bis, conn->num_bis); + + break; + } + } + +unlock: + rcu_read_unlock(); + + if (!pdu->num_bis) + return 0; + + return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, + struct_size(pdu, bis, pdu->num_bis), pdu); +} + int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) { - DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); int err; - if (num_bis < 0x01 || num_bis > pdu->num_bis) + if (num_bis < 0x01 || num_bis > ISO_MAX_NUM_BIS) return -EINVAL; err = qos_set_big(hdev, qos); if (err) return err; - if (hcon) - hcon->iso_qos.bcast.big = qos->bcast.big; + if (hcon) { + /* Update hcon QoS */ + hcon->iso_qos = *qos; - pdu->handle = qos->bcast.big; - pdu->sync_handle = cpu_to_le16(sync_handle); - pdu->encryption = qos->bcast.encryption; - memcpy(pdu->bcode, qos->bcast.bcode, sizeof(pdu->bcode)); - pdu->mse = qos->bcast.mse; - pdu->timeout = cpu_to_le16(qos->bcast.timeout); - pdu->num_bis = num_bis; - memcpy(pdu->bis, bis, num_bis); + hcon->num_bis = num_bis; + memcpy(hcon->bis, bis, num_bis); + } - return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, - struct_size(pdu, bis, num_bis), pdu); + return hci_le_big_create_sync_pending(hdev); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fd269fcabc2e..2b5ba8acd1d8 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6920,7 +6920,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_evt_le_big_sync_estabilished *ev = data; - struct hci_conn *bis; + struct hci_conn *bis, *conn; int i; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6931,6 +6931,20 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + conn = hci_conn_hash_lookup_big_sync_pend(hdev, ev->handle, + ev->num_bis); + if (!conn) { + bt_dev_err(hdev, + "Unable to find connection for big 0x%2.2x", + ev->handle); + goto unlock; + } + + clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); + + conn->num_bis = 0; + memset(conn->bis, 0, sizeof(conn->num_bis)); + for (i = 0; i < ev->num_bis; i++) { u16 handle = le16_to_cpu(ev->bis[i]); __le32 interval; @@ -6980,6 +6994,10 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, hci_connect_cfm(bis, ev->status); } +unlock: + /* Handle any other pending BIG sync command */ + hci_le_big_create_sync_pending(hdev); + hci_dev_unlock(hdev); } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 0d98cc16bbac..9499ddfd25e7 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1957,6 +1957,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (sk) { int err; + struct hci_conn *hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; @@ -1965,7 +1966,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, NULL, + err = hci_le_big_create_sync(hdev, + hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, -- cgit v1.2.3 From 83d328a72eff3268ea4c19deb0a6cf4c7da15746 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:39 +0200 Subject: Bluetooth: ISO: Update hci_conn_hash_lookup_big for Broadcast slave Currently, hci_conn_hash_lookup_big only checks for BIS master connections, by filtering out connections with the destination address set. This commit updates this function to also consider BIS slave connections, since it is also used for a Broadcast Receiver to set an available BIG handle before issuing the LE BIG Create Sync command. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 12 +++++++++++- net/bluetooth/hci_event.c | 1 + net/bluetooth/iso.c | 1 - 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c95f7e6ba255..ea798f07c5a2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1285,7 +1285,17 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK) + if (c->type != ISO_LINK) + continue; + + /* An ISO_LINK hcon with BDADDR_ANY as destination + * address is a Broadcast connection. A Broadcast + * slave connection is associated with a PA train, + * so the sync_handle can be used to differentiate + * from unicast. + */ + if (bacmp(&c->dst, BDADDR_ANY) && + c->sync_handle == HCI_SYNC_HANDLE_INVALID) continue; if (handle == c->iso_qos.bcast.big) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 2b5ba8acd1d8..aca121408369 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6965,6 +6965,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); + bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; memset(&interval, 0, sizeof(interval)); memcpy(&interval, ev->latency, sizeof(ev->latency)); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 9499ddfd25e7..9e119da43147 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1839,7 +1839,6 @@ static void iso_conn_ready(struct iso_conn *conn) if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); hcon->dst_type = iso_pi(parent)->dst_type; - hcon->sync_handle = iso_pi(parent)->sync_handle; } if (ev3) { -- cgit v1.2.3 From 96e7c4273560be4744ba8c444bc6969745315251 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 6 Nov 2024 09:53:45 -0500 Subject: Bluetooth: HCI: Add IPC(11) bus type Zephyr(1) has been using the same bus defines as Linux so tools likes of btmon, etc, are able to decode the bus used by the driver to transport HCI packets. Link: https://github.com/zephyrproject-rtos/zephyr/pull/80808 Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 5bb4eaa52e14..6203bd8663b7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -68,6 +68,7 @@ #define HCI_I2C 8 #define HCI_SMD 9 #define HCI_VIRTIO 10 +#define HCI_IPC 11 /* HCI device quirks */ enum { -- cgit v1.2.3 From 827af4787e74e8df9e8e0677a69fbb15e0856d2f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 23 Oct 2024 16:55:57 -0400 Subject: Bluetooth: MGMT: Add initial implementation of MGMT_OP_HCI_CMD_SYNC This adds the initial implementation of MGMT_OP_HCI_CMD_SYNC as documented in mgmt-api (BlueZ tree): Send HCI command and wait for event Command =========================================== Command Code: 0x005B Controller Index: Command Parameters: Opcode (2 Octets) Event (1 Octet) Timeout (1 Octet) Parameter Length (2 Octets) Parameter (variable) Return Parameters: Response (1-variable Octets) This command may be used to send a HCI command and wait for an (optional) event. The HCI command is specified by the Opcode, any arbitrary is supported including vendor commands, but contrary to the like of Raw/User channel it is run as an HCI command send by the kernel since it uses its command synchronization thus it is possible to wait for a specific event as a response. Setting event to 0x00 will cause the command to wait for either HCI Command Status or HCI Command Complete. Timeout is specified in seconds, setting it to 0 will cause the default timeout to be used. Possible errors: Failed Invalid Parameters Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 10 ++++++++ net/bluetooth/mgmt.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index d382679efd2b..affac861efdc 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -878,6 +878,16 @@ struct mgmt_cp_mesh_send_cancel { } __packed; #define MGMT_MESH_SEND_CANCEL_SIZE 1 +#define MGMT_OP_HCI_CMD_SYNC 0x005B +struct mgmt_cp_hci_cmd_sync { + __le16 opcode; + __u8 event; + __u8 timeout; + __le16 params_len; + __u8 params[]; +} __packed; +#define MGMT_HCI_CMD_SYNC_SIZE 6 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a429661b676a..1f6d083682b8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -132,6 +132,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_MESH_READ_FEATURES, MGMT_OP_MESH_SEND, MGMT_OP_MESH_SEND_CANCEL, + MGMT_OP_HCI_CMD_SYNC, }; static const u16 mgmt_events[] = { @@ -2515,6 +2516,64 @@ unlock: return err; } +static int send_hci_cmd_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_hci_cmd_sync *cp = cmd->param; + struct sk_buff *skb; + + skb = __hci_cmd_sync_ev(hdev, le16_to_cpu(cp->opcode), + le16_to_cpu(cp->params_len), cp->params, + cp->event, cp->timeout ? + msecs_to_jiffies(cp->timeout * 1000) : + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, + mgmt_status(PTR_ERR(skb))); + goto done; + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, 0, + skb->data, skb->len); + + kfree_skb(skb); + +done: + mgmt_pending_free(cmd); + + return 0; +} + +static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_hci_cmd_sync *cp = data; + struct mgmt_pending_cmd *cmd; + int err; + + if (len < sizeof(*cp)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + cmd = mgmt_pending_new(sk, MGMT_OP_HCI_CMD_SYNC, hdev, data, len); + if (!cmd) + err = -ENOMEM; + else + err = hci_cmd_sync_queue(hdev, send_hci_cmd_sync, cmd, NULL); + + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, + MGMT_STATUS_FAILED); + + if (cmd) + mgmt_pending_free(cmd); + } + + hci_dev_unlock(hdev); + return err; +} + /* This is a helper function to test for pending mgmt commands that can * cause CoD or EIR HCI commands. We can only allow one such pending * mgmt command at a time since otherwise we cannot easily track what @@ -9371,6 +9430,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { mesh_send, MGMT_MESH_SEND_SIZE, HCI_MGMT_VAR_LEN }, { mesh_send_cancel, MGMT_MESH_SEND_CANCEL_SIZE }, + { mgmt_hci_cmd_sync, MGMT_HCI_CMD_SYNC_SIZE, HCI_MGMT_VAR_LEN }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.2.3 From accdd51dc74ff65b7b7be1961b11723d228fbbbd Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:04 +0800 Subject: net/udp: Add a new struct for hash2 slot Preparing for udp 4-tuple hash (uhash4 for short). To implement uhash4 without cache line missing when lookup, hslot2 is used to record the number of hashed sockets in hslot4. Thus adding a new struct udp_hslot_main with field hash4_cnt, which is used by hash2. The new struct is used to avoid doubling the size of udp_hslot. Before uhash4 lookup, firstly checking hash4_cnt to see if there are hashed sks in hslot4. Because hslot2 is always used in lookup, there is no cache line miss. Related helpers are updated, and use the helpers as possible. uhash4 is implemented in following patches. Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 38 ++++++++++++++++++++++++++++++++++---- net/ipv4/udp.c | 44 +++++++++++++++++++++++--------------------- net/ipv6/udp.c | 15 ++++++--------- 3 files changed, 63 insertions(+), 34 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index 61222545ab1c..62a7207e65f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,7 +50,7 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot + * struct udp_hslot - UDP hash slot used by udp_table.hash * * @head: head of list of sockets * @count: number of sockets in 'head' list @@ -60,7 +60,22 @@ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; -} __attribute__((aligned(2 * sizeof(long)))); +} __aligned(2 * sizeof(long)); + +/** + * struct udp_hslot_main - UDP hash slot used by udp_table.hash2 + * + * @hslot: basic hash slot + * @hash4_cnt: number of sockets in hslot4 of the same + * (local port, local address) + */ +struct udp_hslot_main { + struct udp_hslot hslot; /* must be the first member */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + u32 hash4_cnt; +#endif +} __aligned(2 * sizeof(long)); +#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot)) /** * struct udp_table - UDP table @@ -72,7 +87,7 @@ struct udp_hslot { */ struct udp_table { struct udp_hslot *hash; - struct udp_hslot *hash2; + struct udp_hslot_main *hash2; unsigned int mask; unsigned int log; }; @@ -84,6 +99,7 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, { return &table->hash[udp_hashfn(net, num, table->mask)]; } + /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() @@ -91,8 +107,22 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { - return &table->hash2[hash & table->mask]; + return &table->hash2[hash & table->mask].hslot; +} + +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_table_hash4_init(struct udp_table *table) +{ +} +#else /* !CONFIG_BASE_SMALL */ + +/* Must be called with table->hash2 initialized */ +static inline void udp_table_hash4_init(struct udp_table *table) +{ + for (int i = 0; i <= table->mask; i++) + table->hash2[i].hash4_cnt = 0; } +#endif /* CONFIG_BASE_SMALL */ extern struct proto udp_prot; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0e24916b39d4..2fdac5fae2a8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -486,13 +486,12 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv4_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, @@ -519,8 +518,7 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, @@ -2268,7 +2266,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -2539,14 +2537,13 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -3187,7 +3184,7 @@ again: batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { - struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; + struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) continue; @@ -3428,10 +3425,11 @@ __setup("uhash_entries=", set_uhash_entries); void __init udp_table_init(struct udp_table *table, const char *name) { - unsigned int i; + unsigned int i, slot_size; + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); table->hash = alloc_large_system_hash(name, - 2 * sizeof(struct udp_hslot), + slot_size, uhash_entries, 21, /* one slot per 2 MB */ 0, @@ -3440,17 +3438,18 @@ void __init udp_table_init(struct udp_table *table, const char *name) UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); - table->hash2 = table->hash + (table->mask + 1); + table->hash2 = (void *)(table->hash + (table->mask + 1)); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_HEAD(&table->hash2[i].head); - table->hash2[i].count = 0; - spin_lock_init(&table->hash2[i].lock); + INIT_HLIST_HEAD(&table->hash2[i].hslot.head); + table->hash2[i].hslot.count = 0; + spin_lock_init(&table->hash2[i].hslot.lock); } + udp_table_hash4_init(table); } u32 udp_flow_hashrnd(void) @@ -3476,18 +3475,20 @@ static void __net_init udp_sysctl_init(struct net *net) static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; + unsigned int slot_size; int i; udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); if (!udptable) goto out; - udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); + udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; - udptable->hash2 = udptable->hash + hash_entries; + udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); @@ -3496,10 +3497,11 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); - INIT_HLIST_HEAD(&udptable->hash2[i].head); - udptable->hash2[i].count = 0; - spin_lock_init(&udptable->hash2[i].lock); + INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head); + udptable->hash2[i].hslot.count = 0; + spin_lock_init(&udptable->hash2[i].hslot.lock); } + udp_table_hash4_init(udptable); return udptable; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0cef8ae5d1ea..0d7aac9d44e5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -224,13 +224,12 @@ struct sock *__udp6_lib_lookup(const struct net *net, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv6_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, @@ -257,8 +256,7 @@ struct sock *__udp6_lib_lookup(const struct net *net, /* Lookup wildcard sockets */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, @@ -859,7 +857,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -1065,14 +1063,13 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { -- cgit v1.2.3 From dab78a1745ab3c6001e1e4d50a9d09efef8e260d Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:05 +0800 Subject: net/udp: Add 4-tuple hash list basis Add a new hash list, hash4, in udp table. It will be used to implement 4-tuple hash for connected udp sockets. This patch adds the hlist to table, and implements helpers and the initialization. 4-tuple hash is implemented in the following patch. hash4 uses hlist_nulls to avoid moving wrongly onto another hlist due to concurrent rehash, because rehash() can happen with lookup(). Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/udp.h | 11 +++++++ include/net/udp.h | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/udp.c | 6 ++-- 3 files changed, 97 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 3eb3f2b9a2a0..0807e21cfec9 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -56,6 +56,12 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + /* For UDP 4-tuple hash */ + __u16 udp_lrpa_hash; + struct hlist_nulls_node udp_lrpa_node; +#endif + /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -206,6 +212,11 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) +#if !IS_ENABLED(CONFIG_BASE_SMALL) +#define udp_lrpa_for_each_entry_rcu(__up, node, list) \ + hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) +#endif + #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 62a7207e65f2..edb669967130 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,14 +50,21 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot used by udp_table.hash + * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4 * * @head: head of list of sockets + * @nulls_head: head of list of sockets, only used by hash4 * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_head head; + union { + struct hlist_head head; + /* hash4 uses hlist_nulls to avoid moving wrongly onto another + * hlist, because rehash() can happen with lookup(). + */ + struct hlist_nulls_head nulls_head; + }; int count; spinlock_t lock; } __aligned(2 * sizeof(long)); @@ -82,12 +89,17 @@ struct udp_hslot_main { * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) + * @hash4: hash table, connected sockets are hashed on + * (local port, local address, remote port, remote address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot_main *hash2; +#if !IS_ENABLED(CONFIG_BASE_SMALL) + struct udp_hslot *hash4; +#endif unsigned int mask; unsigned int log; }; @@ -114,13 +126,80 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, static inline void udp_table_hash4_init(struct udp_table *table) { } + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + BUILD_BUG(); + return NULL; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return false; +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return 0; +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return false; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ +} #else /* !CONFIG_BASE_SMALL */ /* Must be called with table->hash2 initialized */ static inline void udp_table_hash4_init(struct udp_table *table) { - for (int i = 0; i <= table->mask; i++) + table->hash4 = (void *)(table->hash2 + (table->mask + 1)); + for (int i = 0; i <= table->mask; i++) { table->hash2[i].hash4_cnt = 0; + + INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i); + table->hash4[i].count = 0; + spin_lock_init(&table->hash4[i].lock); + } +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + return &table->hash4[hash & table->mask]; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node); +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return sizeof(struct udp_hslot); +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return UDP_HSLOT_MAIN(hslot2)->hash4_cnt; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt++; +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt--; } #endif /* CONFIG_BASE_SMALL */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2fdac5fae2a8..0bc0881d6569 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3427,7 +3427,8 @@ void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i, slot_size; - slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + + udp_hash4_slot_size(); table->hash = alloc_large_system_hash(name, slot_size, uhash_entries, @@ -3482,7 +3483,8 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent if (!udptable) goto out; - slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + + udp_hash4_slot_size(); udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) -- cgit v1.2.3 From 78c91ae2c6deb5d236a5a93ff2995cdd05514380 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:06 +0800 Subject: ipv4/udp: Add 4-tuple hash for connected socket Currently, the udp_table has two hash table, the port hash and portaddr hash. Usually for UDP servers, all sockets have the same local port and addr, so they are all on the same hash slot within a reuseport group. In some applications, UDP servers use connect() to manage clients. In particular, when firstly receiving from an unseen 4 tuple, a new socket is created and connect()ed to the remote addr:port, and then the fd is used exclusively by the client. Once there are connected sks in a reuseport group, udp has to score all sks in the same hash2 slot to find the best match. This could be inefficient with a large number of connections, resulting in high softirq overhead. To solve the problem, this patch implement 4-tuple hash for connected udp sockets. During connect(), hash4 slot is updated, as well as a corresponding counter, hash4_cnt, in hslot2. In __udp4_lib_lookup(), hslot4 will be searched firstly if the counter is non-zero. Otherwise, hslot2 is used like before. Note that only connected sockets enter this hash4 path, while un-connected ones are not affected. hlist_nulls is used for hash4, because we probably move to another hslot wrongly when lookup with concurrent rehash. Then we check nulls at the list end to see if we should restart lookup. Because udp does not use SLAB_TYPESAFE_BY_RCU, we don't need to touch sk_refcnt when lookup. Stress test results (with 1 cpu fully used) are shown below, in pps: (1) _un-connected_ socket as server [a] w/o hash4: 1,825176 [b] w/ hash4: 1,831750 (+0.36%) (2) 500 _connected_ sockets as server [c] w/o hash4: 290860 (only 16% of [a]) [d] w/ hash4: 1,889658 (+3.1% compared with [b]) With hash4, compute_score is skipped when lookup, so [d] is slightly better than [b]. Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 16 ++++- net/ipv4/udp.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/udp.c | 2 +- 3 files changed, 210 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index edb669967130..feb06c0e48fb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -302,13 +302,27 @@ static inline int udp_lib_hash(struct sock *sk) } void udp_lib_unhash(struct sock *sk); -void udp_lib_rehash(struct sock *sk, u16 new_hash); +void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } +/* hash4 routines shared between UDPv4/6 */ +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_lib_hash4(struct sock *sk, u16 hash) +{ +} + +static inline void udp4_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +void udp_lib_hash4(struct sock *sk, u16 hash); +void udp4_hash4(struct sock *sk); +#endif /* CONFIG_BASE_SMALL */ + int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0bc0881d6569..b6c5edd7ff48 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -478,6 +478,159 @@ rescore: return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp4_lib_lookup4(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp_rehash4(struct udp_table *udptable, struct sock *sk, + u16 newhash4) +{ +} + +static void udp_unhash4(struct udp_table *udptable, struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp4_lib_lookup4(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + INET_ADDR_COOKIE(acookie, saddr, daddr); + +begin: + /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */ + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet_match(net, sk, acookie, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */ +static void udp_rehash4(struct udp_table *udptable, struct sock *sk, + u16 newhash4) +{ + struct udp_hslot *hslot4, *nhslot4; + + hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); + nhslot4 = udp_hashslot4(udptable, newhash4); + udp_sk(sk)->udp_lrpa_hash = newhash4; + + if (hslot4 != nhslot4) { + spin_lock_bh(&hslot4->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); + hslot4->count--; + spin_unlock_bh(&hslot4->lock); + + spin_lock_bh(&nhslot4->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, + &nhslot4->nulls_head); + nhslot4->count++; + spin_unlock_bh(&nhslot4->lock); + } +} + +static void udp_unhash4(struct udp_table *udptable, struct sock *sk) +{ + struct udp_hslot *hslot2, *hslot4; + + if (udp_hashed4(sk)) { + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); + + spin_lock(&hslot4->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); + hslot4->count--; + spin_unlock(&hslot4->lock); + + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + } +} + +void udp_lib_hash4(struct sock *sk, u16 hash) +{ + struct udp_hslot *hslot, *hslot2, *hslot4; + struct net *net = sock_net(sk); + struct udp_table *udptable; + + /* Connected udp socket can re-connect to another remote address, + * so rehash4 is needed. + */ + udptable = net->ipv4.udp_table; + if (udp_hashed4(sk)) { + udp_rehash4(udptable, sk, hash); + return; + } + + hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + hslot4 = udp_hashslot4(udptable, hash); + udp_sk(sk)->udp_lrpa_hash = hash; + + spin_lock_bh(&hslot->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + + spin_lock(&hslot4->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, + &hslot4->nulls_head); + hslot4->count++; + spin_unlock(&hslot4->lock); + + spin_lock(&hslot2->lock); + udp_hash4_inc(hslot2); + spin_unlock(&hslot2->lock); + + spin_unlock_bh(&hslot->lock); +} +EXPORT_SYMBOL(udp_lib_hash4); + +/* call with sock lock */ +void udp4_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY)) + return; + + hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +EXPORT_SYMBOL(udp4_hash4); +#endif /* CONFIG_BASE_SMALL */ + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -493,6 +646,13 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, hash2 = ipv4_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); + if (udp_has_hash4(hslot2)) { + result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp4_lib_lookup4 return sk or NULL */ + return result; + } + /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, @@ -1933,6 +2093,18 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_SYMBOL(udp_pre_connect); +static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip4_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp4_hash4(sk); + release_sock(sk); + return res; +} + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -1992,6 +2164,8 @@ void udp_lib_unhash(struct sock *sk) hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); + + udp_unhash4(udptable, sk); } spin_unlock_bh(&hslot->lock); } @@ -2001,7 +2175,7 @@ EXPORT_SYMBOL(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash */ -void udp_lib_rehash(struct sock *sk, u16 newhash) +void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); @@ -2033,6 +2207,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) spin_unlock(&nhslot2->lock); } + if (udp_hashed4(sk)) { + udp_rehash4(udptable, sk, newhash4); + + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); + } + } spin_unlock_bh(&hslot->lock); } } @@ -2044,7 +2231,11 @@ void udp_v4_rehash(struct sock *sk) u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); - udp_lib_rehash(sk, new_hash); + u16 new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + + udp_lib_rehash(sk, new_hash, new_hash4); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -2937,7 +3128,7 @@ struct proto udp_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udp_pre_connect, - .connect = ip4_datagram_connect, + .connect = udp_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udp_init_sock, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0d7aac9d44e5..1ea99d704e31 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -111,7 +111,7 @@ void udp_v6_rehash(struct sock *sk) &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); - udp_lib_rehash(sk, new_hash); + udp_lib_rehash(sk, new_hash, 0); /* 4-tuple hash not implemented */ } static int compute_score(struct sock *sk, const struct net *net, -- cgit v1.2.3 From 1b29a730ef8b6fd3aa3e11c2f6d409cf201cd913 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:07 +0800 Subject: ipv6/udp: Add 4-tuple hash for connected socket Implement ipv6 udp hash4 like that in ipv4. The major difference is that the hash value should be calculated with udp6_ehashfn(). Besides, ipv4-mapped ipv6 address is handled before hash() and rehash(). Export udp_ehashfn because now we use it in udpv6 rehash. Core procedures of hash/unhash/rehash are same as ipv4, and udpv4 and udpv6 share the same udptable, so some functions in ipv4 hash4 can also be shared. Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 2 ++ net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index feb06c0e48fb..6e89520e100d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -303,6 +303,8 @@ static inline int udp_lib_hash(struct sock *sk) void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); static inline void udp_lib_close(struct sock *sk, long timeout) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b6c5edd7ff48..6a01905d379f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -410,7 +410,6 @@ static int compute_score(struct sock *sk, const struct net *net, return score; } -INDIRECT_CALLABLE_SCOPE u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { @@ -419,6 +418,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL(udp_ehashfn); /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1ea99d704e31..d766fd798ecf 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -110,8 +110,19 @@ void udp_v6_rehash(struct sock *sk) u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); + u16 new_hash4; - udp_lib_rehash(sk, new_hash, 0); /* 4-tuple hash not implemented */ + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + } else { + new_hash4 = udp6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + } + + udp_lib_rehash(sk, new_hash, new_hash4); } static int compute_score(struct sock *sk, const struct net *net, @@ -216,6 +227,74 @@ rescore: return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + +begin: + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + udp4_hash4(sk); + return; + } + + if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return; + + hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +#endif /* CONFIG_BASE_SMALL */ + /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -231,6 +310,13 @@ struct sock *__udp6_lib_lookup(const struct net *net, hash2 = ipv6_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); + if (udp_has_hash4(hslot2)) { + result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp6_lib_lookup4 return sk or NULL */ + return result; + } + /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, @@ -1166,6 +1252,18 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } +static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp6_hash4(sk); + release_sock(sk); + return res; +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1761,7 +1859,7 @@ struct proto udpv6_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udpv6_pre_connect, - .connect = ip6_datagram_connect, + .connect = udpv6_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udpv6_init_sock, -- cgit v1.2.3 From dbefa1f31a91670c9e7dac9b559625336206466f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Nov 2024 01:14:41 +0900 Subject: Rename .data.once to .data..once to fix resetting WARN*_ONCE Commit b1fca27d384e ("kernel debug: support resetting WARN*_ONCE") added support for clearing the state of once warnings. However, it is not functional when CONFIG_LD_DEAD_CODE_DATA_ELIMINATION or CONFIG_LTO_CLANG is enabled, because .data.once matches the .data.[0-9a-zA-Z_]* pattern in the DATA_MAIN macro. Commit cb87481ee89d ("kbuild: linker script do not match C names unless LD_DEAD_CODE_DATA_ELIMINATION is configured") was introduced to suppress the issue for the default CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=n case, providing a minimal fix for stable backporting. We were aware this did not address the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y. The plan was to apply correct fixes and then revert cb87481ee89d. [1] Seven years have passed since then, yet the #ifdef workaround remains in place. Meanwhile, commit b1fca27d384e introduced the .data.once section, and commit dc5723b02e52 ("kbuild: add support for Clang LTO") extended the #ifdef. Using a ".." separator in the section name fixes the issue for CONFIG_LD_DEAD_CODE_DATA_ELIMINATION and CONFIG_LTO_CLANG. [1]: https://lore.kernel.org/linux-kbuild/CAK7LNASck6BfdLnESxXUeECYL26yUDm0cwRZuM4gmaWUkxjL5g@mail.gmail.com/ Fixes: b1fca27d384e ("kernel debug: support resetting WARN*_ONCE") Fixes: dc5723b02e52 ("kbuild: add support for Clang LTO") Signed-off-by: Masahiro Yamada --- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/mmdebug.h | 6 +++--- include/linux/once.h | 4 ++-- include/linux/once_lite.h | 2 +- include/net/net_debug.h | 2 +- mm/internal.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 3c9dc1fd094d..54504013c749 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -359,7 +359,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.data..shared_aligned) /* percpu related */ \ *(.data..unlikely) \ __start_once = .; \ - *(.data.once) \ + *(.data..once) \ __end_once = .; \ STRUCT_ALIGN(); \ *(__tracepoints) \ diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 39a7714605a7..d7cb1e5ecbda 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -46,7 +46,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ } while (0) #define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -66,7 +66,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); unlikely(__ret_warn); \ }) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ @@ -77,7 +77,7 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); unlikely(__ret_warn_once); \ }) #define VM_WARN_ON_ONCE_MM(cond, mm) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ diff --git a/include/linux/once.h b/include/linux/once.h index bc714d414448..30346fcdc799 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -46,7 +46,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ + static bool __section(".data..once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ @@ -64,7 +64,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE_SLEEPABLE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ + static bool __section(".data..once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ ___ret = __do_once_sleepable_start(&___done); \ diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h index b7bce4983638..27de7bc32a06 100644 --- a/include/linux/once_lite.h +++ b/include/linux/once_lite.h @@ -12,7 +12,7 @@ #define __ONCE_LITE_IF(condition) \ ({ \ - static bool __section(".data.once") __already_done; \ + static bool __section(".data..once") __already_done; \ bool __ret_cond = !!(condition); \ bool __ret_once = false; \ \ diff --git a/include/net/net_debug.h b/include/net/net_debug.h index 1e74684cbbdb..4a79204c8d30 100644 --- a/include/net/net_debug.h +++ b/include/net/net_debug.h @@ -27,7 +27,7 @@ void netdev_info(const struct net_device *dev, const char *format, ...); #define netdev_level_once(level, dev, fmt, ...) \ do { \ - static bool __section(".data.once") __print_once; \ + static bool __section(".data..once") __print_once; \ \ if (!__print_once) { \ __print_once = true; \ diff --git a/mm/internal.h b/mm/internal.h index 93083bbeeefa..a23f7b11b760 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -48,7 +48,7 @@ struct folio_batch; * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ - static bool __section(".data.once") __warned; \ + static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ -- cgit v1.2.3