From 8f5b408d7661e33157b16c4e4d232f483e8e4f79 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 1 Oct 2024 22:06:05 +0200 Subject: bpf: Remove unused macro Commit 7aebfa1b3885 ("bpf: Support narrow loads from bpf_sock_addr.user_port") removed one and only SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD callsite but kept the macro. Remove it to clean up the code base. Found while getting lost in the BPF code. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20241001200605.249526-1-maciej.fijalkowski@intel.com --- net/core/filter.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index cd3524cb326b..e61ac225c41b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10241,10 +10241,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, } \ } while (0) -#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) - static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, -- cgit v1.3 From 32b7580be4e5a4236cc1e0ddf403ccd2ddda9525 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 30 Sep 2024 14:33:28 +0200 Subject: netem: Include in sch_netem.c Include header to allow the removal of legacy inclusion of from . Signed-off-by: Uros Bizjak Cc: Stephen Hemminger Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Acked-by: Stephen Hemminger Signed-off-by: Jason A. Donenfeld --- net/sched/sch_netem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 39382ee1e331..fe6fed291a7b 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include -- cgit v1.3 From 9b8ca04854fd1253a58aeb1bd089c191cb5a074c Mon Sep 17 00:00:00 2001 From: Alexandre Ferrieux Date: Wed, 2 Oct 2024 01:14:38 +0200 Subject: ipv4: avoid quadratic behavior in FIB insertion of common address Mix netns into all IPv4 FIB hashes to avoid massive collision when inserting the same address in many netns. Signed-off-by: Alexandre Ferrieux Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241001231438.3855035-1-alexandre.ferrieux@orange.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ba2df3d2ac15..1a847ba40458 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -347,11 +347,10 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, return val; } -static unsigned int fib_info_hashfn_result(unsigned int val) +static unsigned int fib_info_hashfn_result(const struct net *net, + unsigned int val) { - unsigned int mask = (fib_info_hash_size - 1); - - return (val ^ (val >> 7) ^ (val >> 12)) & mask; + return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); } static inline unsigned int fib_info_hashfn(struct fib_info *fi) @@ -370,7 +369,7 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) } endfor_nexthops(fi) } - return fib_info_hashfn_result(val); + return fib_info_hashfn_result(fi->fib_net, val); } /* no metrics, only nexthop id */ @@ -385,7 +384,7 @@ static struct fib_info *fib_find_info_nh(struct net *net, cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); - hash = fib_info_hashfn_result(hash); + hash = fib_info_hashfn_result(net, hash); head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { -- cgit v1.3 From 913c83a610bb7dd8e5952a2b4663e1feec0b5de6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:37 +0200 Subject: ipv4: Convert icmp_route_lookup() to dscp_t. Pass a dscp_t variable to icmp_route_lookup(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Rename that variable ("tos" -> "dscp") to make the intent clear. While there, reorganise the function parameters to fill up horizontal space. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/294fead85c6035bcdc5fcf9a6bb4ce8798c45ba1.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e1384e7331d8..7d7b25ed8d21 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -478,13 +478,11 @@ static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) return route_lookup_dev; } -static struct rtable *icmp_route_lookup(struct net *net, - struct flowi4 *fl4, +static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, - const struct iphdr *iph, - __be32 saddr, u8 tos, u32 mark, - int type, int code, - struct icmp_bxm *param) + const struct iphdr *iph, __be32 saddr, + dscp_t dscp, u32 mark, int type, + int code, struct icmp_bxm *param) { struct net_device *route_lookup_dev; struct dst_entry *dst, *dst2; @@ -498,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; @@ -547,7 +545,7 @@ static struct rtable *icmp_route_lookup(struct net *net, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - tos, rt2->dst.dev); + inet_dscp_to_dsfield(dscp), rt2->dst.dev); dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); @@ -741,8 +739,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, - type, code, &icmp_param); + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, + inet_dsfield_to_dscp(tos), mark, type, code, + &icmp_param); if (IS_ERR(rt)) goto out_unlock; -- cgit v1.3 From 7e863e5db6185b1add0df4cb01b31a4ed1c4b738 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:43 +0200 Subject: ipv4: Convert ip_route_input() to dscp_t. Pass a dscp_t variable to ip_route_input(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input() to consider are: * input_action_end_dx4_finish() and input_action_end_dt4() in net/ipv6/seg6_local.c. These functions set the tos parameter to 0, which is already a valid dscp_t value, so they don't need to be adjusted for the new prototype. * icmp_route_lookup(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * br_nf_pre_routing_finish(), ip_options_rcv_srr() and ip4ip6_err(), which get the DSCP directly from IPv4 headers. Define a helper to read the .tos field of struct iphdr as dscp_t, so that these function don't have to do the conversion manually. While there, declare *iph as const in br_nf_pre_routing_finish(), declare its local variables in reverse-christmas-tree order and move the "err = ip_route_input()" assignment out of the conditional to avoid checkpatch warning. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/e9d40781d64d3d69f4c79ac8a008b8d67a033e8d.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 5 +++++ include/net/route.h | 5 +++-- net/bridge/br_netfilter_hooks.c | 8 +++++--- net/ipv4/icmp.c | 2 +- net/ipv4/ip_options.c | 3 ++- net/ipv6/ip6_tunnel.c | 4 ++-- 6 files changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index d92d3bc3ec0e..bab084df1567 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -424,6 +424,11 @@ int ip_decrease_ttl(struct iphdr *iph) return --iph->ttl; } +static inline dscp_t ip4h_dscp(const struct iphdr *ip4h) +{ + return inet_dsfield_to_dscp(ip4h->tos); +} + static inline int ip_mtu_locked(const struct dst_entry *dst) { const struct rtable *rt = dst_rtable(dst); diff --git a/include/net/route.h b/include/net/route.h index 1789f1e6640b..03dd28cf4bc4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -208,12 +208,13 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin) + dscp_t dscp, struct net_device *devin) { int err; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, tos, devin); + err = ip_route_input_noref(skb, dst, src, inet_dscp_to_dsfield(dscp), + devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 0e8bc0ea6175..c6bab2b5e834 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -369,9 +369,9 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb, */ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb->dev, *br_indev; - struct iphdr *iph = ip_hdr(skb); struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); + struct net_device *dev = skb->dev, *br_indev; + const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; int err; @@ -389,7 +389,9 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + err = ip_route_input(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (err) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 7d7b25ed8d21..23664434922e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -545,7 +545,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - inet_dscp_to_dsfield(dscp), rt2->dst.dev); + dscp, rt2->dst.dev); dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 68aedb8877b9..81e86e5defee 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -617,7 +617,8 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); - err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, dev); + err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), + dev); rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b60e13c42bca..48fd53b98972 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -630,8 +630,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } skb_dst_set(skb2, &rt->dst); } else { - if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, - skb2->dev) || + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, + ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } -- cgit v1.3 From 66fb6386d358a04edd5c640e38b4a02b323b89d8 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:49 +0200 Subject: ipv4: Convert ip_route_input_noref() to dscp_t. Pass a dscp_t variable to ip_route_input_noref(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input_noref() to consider are: * arp_process() in net/ipv4/arp.c. This function sets the tos parameter to 0, which is already a valid dscp_t value, so it doesn't need to be adjusted for the new prototype. * ip_route_input(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * ipvlan_l3_rcv(), bpf_lwt_input_reroute(), ip_expire(), ip_rcv_finish_core(), xfrm4_rcv_encap_finish() and xfrm4_rcv_encap(), which get the DSCP directly from IPv4 headers and can simply use the ip4h_dscp() helper. While there, declare the IPv4 header pointers as const in ipvlan_l3_rcv() and bpf_lwt_input_reroute(). Also, modify the declaration of ip_route_input_noref() in include/net/route.h so that it matches the prototype of its implementation in net/ipv4/route.c. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/a8a747bed452519c4d0cc06af32c7e7795d7b627.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ipvlan/ipvlan_l3s.c | 6 ++++-- include/net/route.h | 7 +++---- net/core/lwt_bpf.c | 5 +++-- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/ip_input.c | 2 +- net/ipv4/route.c | 6 +++--- net/ipv4/xfrm4_input.c | 2 +- net/ipv4/xfrm4_protocol.c | 2 +- 8 files changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/net/ipvlan/ipvlan_l3s.c b/drivers/net/ipvlan/ipvlan_l3s.c index d5b05e803219..b4ef386bdb1b 100644 --- a/drivers/net/ipvlan/ipvlan_l3s.c +++ b/drivers/net/ipvlan/ipvlan_l3s.c @@ -2,6 +2,8 @@ /* Copyright (c) 2014 Mahesh Bandewar */ +#include + #include "ipvlan.h" static unsigned int ipvlan_netid __read_mostly; @@ -48,11 +50,11 @@ static struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, switch (proto) { case AF_INET: { - struct iphdr *ip4h = ip_hdr(skb); + const struct iphdr *ip4h = ip_hdr(skb); int err; err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr, - ip4h->tos, sdev); + ip4h_dscp(ip4h), sdev); if (unlikely(err)) goto out; break; diff --git a/include/net/route.h b/include/net/route.h index 03dd28cf4bc4..5e4374d66927 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -201,8 +201,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin); +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, u8 tos, struct net_device *devin, const struct sk_buff *hint); @@ -213,8 +213,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, int err; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, inet_dscp_to_dsfield(dscp), - devin); + err = ip_route_input_noref(skb, dst, src, dscp, devin); if (!err) { skb_dst_force(skb); if (!skb_dst(skb)) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1a14f915b7a4..e0ca24a58810 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -91,12 +92,12 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; - struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + ip4h_dscp(iph), dev); dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a92664a5ef2e..48e2810f1f27 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -175,8 +175,8 @@ static void ip_expire(struct timer_list *t) /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), + head->dev); if (err) goto out; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index b6e7d4921309..c0a2490eb7c1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -363,7 +363,7 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, */ if (!skb_valid_dst(skb)) { err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); + ip4h_dscp(iph), dev); if (unlikely(err)) goto drop_error; } else { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 723ac9181558..00bfc0a11f64 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2465,14 +2465,14 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + dscp_t dscp, struct net_device *dev) { struct fib_result res; int err; - tos &= INET_DSCP_MASK; rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + err = ip_route_input_rcu(skb, daddr, saddr, inet_dscp_to_dsfield(dscp), + dev, &res); rcu_read_unlock(); return err; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index a620618cc568..b5b06323cfd9 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -33,7 +33,7 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + ip4h_dscp(iph), skb->dev)) goto drop; } diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index b146ce88c5d0..4ee624d8e66f 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -76,7 +76,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + ip4h_dscp(iph), skb->dev)) goto drop; } -- cgit v1.3 From be612f5e99e1d48de34f4befcb700d840c15e05e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:28:55 +0200 Subject: ipv4: Convert ip_route_input_rcu() to dscp_t. Pass a dscp_t variable to ip_route_input_rcu(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_route_input_rcu() to consider are: * ip_route_input_noref(), which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * inet_rtm_getroute(), which receives a u8 from user space and needs to convert it with inet_dsfield_to_dscp(). Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/c4dbb5aa9cbc79c4fcb317abbffa7c7156bc56a7.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 00bfc0a11f64..a693b57b4111 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2415,7 +2415,8 @@ martian_source: /* called with rcu_read_lock held */ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, struct fib_result *res) + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing @@ -2456,12 +2457,14 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif ) { err = ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + inet_dscp_to_dsfield(dscp), + dev, our); } return err; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); + return ip_route_input_slow(skb, daddr, saddr, + inet_dscp_to_dsfield(dscp), dev, res); } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2471,8 +2474,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, int err; rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, inet_dscp_to_dsfield(dscp), - dev, &res); + err = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); return err; @@ -3286,8 +3288,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, - rtm->rtm_tos & INET_DSCP_MASK, dev, - &res); + inet_dsfield_to_dscp(rtm->rtm_tos), + dev, &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) -- cgit v1.3 From 783946aa0358c8a9e5f88d74dfc047d855813a06 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 1 Oct 2024 21:29:01 +0200 Subject: ipv4: Convert ip_route_input_slow() to dscp_t. Pass a dscp_t variable to ip_route_input_slow(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_route_input_rcu() actually calls ip_route_input_slow(). Since it already has a dscp_t variable to pass as parameter, we only need to remove the inet_dscp_to_dsfield() conversion. Signed-off-by: Guillaume Nault Reviewed-by: David Ahern Link: https://patch.msgid.link/d6bca5f87eea9e83a3861e6e05594cdd252583c9.1727807926.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a693b57b4111..6e1cd0065b87 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2201,7 +2201,7 @@ static struct net_device *ip_rt_get_dev(struct net *net, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, struct fib_result *res) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2266,7 +2266,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = tos; + fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2299,8 +2299,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, } if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, tos, - 0, dev, in_dev, &itag); + err = fib_validate_source(skb, saddr, daddr, + inet_dscp_to_dsfield(dscp), 0, dev, + in_dev, &itag); if (err < 0) goto martian_source; goto local_input; @@ -2314,7 +2315,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_destination; make_route: - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, + inet_dscp_to_dsfield(dscp), flkeys); out: return err; brd_input: @@ -2322,7 +2324,8 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + err = fib_validate_source(skb, saddr, 0, + inet_dscp_to_dsfield(dscp), 0, dev, in_dev, &itag); if (err < 0) goto martian_source; @@ -2463,8 +2466,7 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return err; } - return ip_route_input_slow(skb, daddr, saddr, - inet_dscp_to_dsfield(dscp), dev, res); + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- cgit v1.3 From 25ba2a5adab2f4e660be631b50f64b7ea218af33 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 30 Sep 2024 14:43:58 +0100 Subject: net/rds: remove unused struct 'rds_ib_dereg_odp_mr' 'rds_ib_dereg_odp_mr' has been unused since the original commit 2eafa1746f17 ("net/rds: Handle ODP mr registration/unregistration"). Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Reviewed-by: Allison Henderson Reviewed-by: Zhu Yanjun Link: https://patch.msgid.link/20240930134358.48647-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- net/rds/ib_rdma.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 8f070ee7e742..d1cfceeff133 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -40,10 +40,6 @@ #include "rds.h" struct workqueue_struct *rds_ib_mr_wq; -struct rds_ib_dereg_odp_mr { - struct work_struct work; - struct ib_mr *mr; -}; static void rds_ib_odp_mr_worker(struct work_struct *work); -- cgit v1.3 From b63c755cb65d43c8aba987c4f6b57c77c6f123f2 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 30 Sep 2024 14:29:53 +0100 Subject: appletalk: Remove deadcode alloc_ltalkdev in net/appletalk/dev.c is dead since commit 00f3696f7555 ("net: appletalk: remove cops support") Removing it (and it's helper) leaves dev.c and if_ltalk.h empty; remove them and the Makefile entry. tun.c was including that if_ltalk.h but actually wanted the uapi version for LTALK_ALEN, fix up the path. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- include/linux/if_ltalk.h | 8 -------- net/appletalk/Makefile | 2 +- net/appletalk/dev.c | 46 ---------------------------------------------- 4 files changed, 2 insertions(+), 56 deletions(-) delete mode 100644 include/linux/if_ltalk.h delete mode 100644 net/appletalk/dev.c (limited to 'net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9a0f6eb32016..d7a865ef370b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,7 +71,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/if_ltalk.h b/include/linux/if_ltalk.h deleted file mode 100644 index 4cc1c0b77870..000000000000 --- a/include/linux/if_ltalk.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_LTALK_H -#define __LINUX_LTALK_H - -#include - -extern struct net_device *alloc_ltalkdev(int sizeof_priv); -#endif diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile index 33164d972d37..152312a15180 100644 --- a/net/appletalk/Makefile +++ b/net/appletalk/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_ATALK) += appletalk.o -appletalk-y := aarp.o ddp.o dev.o +appletalk-y := aarp.o ddp.o appletalk-$(CONFIG_PROC_FS) += atalk_proc.o appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c deleted file mode 100644 index 284c8e585533..000000000000 --- a/net/appletalk/dev.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Moved here from drivers/net/net_init.c, which is: - * Written 1993,1994,1995 by Donald Becker. - */ - -#include -#include -#include -#include -#include - -static void ltalk_setup(struct net_device *dev) -{ - /* Fill in the fields of the device structure with localtalk-generic values. */ - - dev->type = ARPHRD_LOCALTLK; - dev->hard_header_len = LTALK_HLEN; - dev->mtu = LTALK_MTU; - dev->addr_len = LTALK_ALEN; - dev->tx_queue_len = 10; - - dev->broadcast[0] = 0xFF; - - dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP; -} - -/** - * alloc_ltalkdev - Allocates and sets up an localtalk device - * @sizeof_priv: Size of additional driver-private structure to be allocated - * for this localtalk device - * - * Fill in the fields of the device structure with localtalk-generic - * values. Basically does everything except registering the device. - * - * Constructs a new net device, complete with a private data area of - * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for - * this private data area. - */ - -struct net_device *alloc_ltalkdev(int sizeof_priv) -{ - return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN, - ltalk_setup); -} -EXPORT_SYMBOL(alloc_ltalkdev); -- cgit v1.3 From 4aecca4c76808f3736056d18ff510df80424bc9f Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 1 Oct 2024 05:57:14 -0700 Subject: net_tstamp: add SCM_TS_OPT_ID to provide OPT_ID in control message SOF_TIMESTAMPING_OPT_ID socket option flag gives a way to correlate TX timestamps and packets sent via socket. Unfortunately, there is no way to reliably predict socket timestamp ID value in case of error returned by sendmsg. For UDP sockets it's impossible because of lockless nature of UDP transmit, several threads may send packets in parallel. In case of RAW sockets MSG_MORE option makes things complicated. More details are in the conversation [1]. This patch adds new control message type to give user-space software an opportunity to control the mapping between packets and values by providing ID with each sendmsg for UDP sockets. The documentation is also added in this patch. [1] https://lore.kernel.org/netdev/CALCETrU0jB+kg0mhV6A8mrHfTE1D1pr1SD_B9Eaa9aDPfgHdtA@mail.gmail.com/ Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20241001125716.2832769-2-vadfed@meta.com Signed-off-by: Jakub Kicinski --- Documentation/networking/timestamping.rst | 14 ++++++++++++++ arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ include/net/inet_sock.h | 4 +++- include/net/sock.h | 7 +++++++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 13 +++++++++++++ net/ipv4/ip_output.c | 19 ++++++++++++++----- net/ipv6/ip6_output.c | 20 ++++++++++++++------ 11 files changed, 75 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index 8199e6917671..b37bfbfc7d79 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -194,6 +194,20 @@ SOF_TIMESTAMPING_OPT_ID: among all possibly concurrently outstanding timestamp requests for that socket. + The process can optionally override the default generated ID, by + passing a specific ID with control message SCM_TS_OPT_ID (not + supported for TCP sockets):: + + struct msghdr *msg; + ... + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_TS_OPT_ID; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + *((__u32 *) CMSG_DATA(cmsg)) = opt_id; + err = sendmsg(fd, msg, 0); + + SOF_TIMESTAMPING_OPT_ID_TCP: Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 251b73c5481e..302507bf9b5d 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -146,6 +146,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 8ab7582291ab..d118d4731580 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -157,6 +157,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 38fc0b188e08..d268d69bfcd2 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -138,6 +138,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 0x404C + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 57084ed2f3c4..113cd9f353e3 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -139,6 +139,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x0059 +#define SCM_TS_OPT_ID 0x005a + #if !defined(__KERNEL__) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 394c3b66065e..f01dd273bea6 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -174,6 +174,7 @@ struct inet_cork { __s16 tos; char priority; __u16 gso_size; + u32 ts_opt_id; u64 transmit_time; u32 mark; }; @@ -241,7 +242,8 @@ struct inet_sock { struct inet_cork_full cork; }; -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_TS_OPT_ID 2 /* ts_opt_id field is valid, overriding sk_tskey */ enum { INET_FLAGS_PKTINFO = 0, diff --git a/include/net/sock.h b/include/net/sock.h index c58ca8dd561b..ccf28c2b70b1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -954,6 +954,12 @@ enum sock_flags { }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) +/* + * The highest bit of sk_tsflags is reserved for kernel-internal + * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to control that + * SOF_TIMESTAMPING* values do not reach this reserved area + */ +#define SOCKCM_FLAG_TS_OPT_ID BIT(31) static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk) { @@ -1796,6 +1802,7 @@ struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; + u32 ts_opt_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 3b4e3e815602..deacfd6dd197 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -141,6 +141,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 80 +#define SCM_TS_OPT_ID 81 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/core/sock.c b/net/core/sock.c index 039be95c40cf..846f494a17cf 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2899,6 +2899,8 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, { u32 tsflags; + BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); + switch (cmsg->cmsg_type) { case SO_MARK: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && @@ -2927,6 +2929,17 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, return -EINVAL; sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); break; + case SCM_TS_OPT_ID: + if (sk_is_tcp(sk)) + return -EINVAL; + tsflags = READ_ONCE(sk->sk_tsflags); + if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); + sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ case SCM_RIGHTS: case SCM_CREDENTIALS: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 49811c9281d4..0c7049f50369 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -973,7 +973,7 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = dst_rtable(cork->dst); - bool paged, hold_tskey, extra_uref = false; + bool paged, hold_tskey = false, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; @@ -1049,10 +1049,15 @@ static int __ip_append_data(struct sock *sk, cork->length += length; - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; - if (hold_tskey) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } + } /* So, what's going on in the loop below? * @@ -1327,6 +1332,10 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); + if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->flags |= IPCORK_TS_OPT_ID; + cork->ts_opt_id = ipc->sockc.ts_opt_id; + } return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f26841f1490f..ff6bd8d85e9a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1402,7 +1402,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); - + if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { + cork->base.flags |= IPCORK_TS_OPT_ID; + cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; + } cork->base.length = 0; cork->base.transmit_time = ipc6->sockc.transmit_time; @@ -1433,7 +1436,7 @@ static int __ip6_append_data(struct sock *sk, bool zc = false; u32 tskey = 0; struct rt6_info *rt = dst_rt6_info(cork->dst); - bool paged, hold_tskey, extra_uref = false; + bool paged, hold_tskey = false, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; @@ -1543,10 +1546,15 @@ emsgsize: flags &= ~MSG_SPLICE_PAGES; } - hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; - if (hold_tskey) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; + if (cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + if (cork->flags & IPCORK_TS_OPT_ID) { + tskey = cork->ts_opt_id; + } else { + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + hold_tskey = true; + } + } /* * Let's try using as much space as possible. -- cgit v1.3 From 822b5bc6db55f1c3ea51659c423784ac6919ddd4 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 1 Oct 2024 05:57:15 -0700 Subject: net_tstamp: add SCM_TS_OPT_ID for RAW sockets The last type of sockets which supports SOF_TIMESTAMPING_OPT_ID is RAW sockets. To add new option this patch converts all callers (direct and indirect) of _sock_tx_timestamp to provide sockcm_cookie instead of tsflags. And while here fix __sock_tx_timestamp to receive tsflags as __u32 instead of __u16. Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20241001125716.2832769-3-vadfed@meta.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 27 ++++++++++++++++++--------- net/can/raw.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp.c | 7 ++++--- net/ipv6/ip6_output.c | 2 +- net/ipv6/raw.c | 2 +- net/packet/af_packet.c | 6 +++--- net/socket.c | 2 +- 9 files changed, 31 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index ccf28c2b70b1..e282127092ab 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2660,39 +2660,48 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, sock_write_timestamp(sk, 0); } -void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); +void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet - * @tsflags: timestamping flags to use + * @sockc: pointer to socket cmsg cookie to get timestamping info * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ -static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void _sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags, __u32 *tskey) { + __u32 tsflags = sockc->tsflags; + if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && - tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (tsflags & SOCKCM_FLAG_TS_OPT_ID) + *tskey = sockc->ts_opt_id; + else + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; + } } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } -static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, +static inline void sock_tx_timestamp(struct sock *sk, + const struct sockcm_cookie *sockc, __u8 *tx_flags) { - _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); + _sock_tx_timestamp(sk, sockc, tx_flags, NULL); } -static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) +static inline void skb_setup_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc) { - _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, + _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } diff --git a/net/can/raw.c b/net/can/raw.c index 00533f64d69d..255c0a8f39d6 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -966,7 +966,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; - skb_setup_tx_timestamp(skb, sockc.tsflags); + skb_setup_tx_timestamp(skb, &sockc); err = can_send(skb, ro->loopback); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0c7049f50369..e5c55a95063d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1331,7 +1331,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->priority = ipc->priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; - sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags); + sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->flags |= IPCORK_TS_OPT_ID; cork->ts_opt_id = ipc->sockc.ts_opt_id; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 474dfd263c8b..0e9e01967ec9 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -370,7 +370,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4f77bd862e95..82cc4a5633ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -477,15 +477,16 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) +static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) { struct sk_buff *skb = tcp_write_queue_tail(sk); + u32 tsflags = sockc->tsflags; if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); + sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack = 1; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) @@ -1321,7 +1322,7 @@ wait_for_space: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags); + tcp_tx_timestamp(sk, &sockc); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ff6bd8d85e9a..205673179b3c 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1401,7 +1401,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; - sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); + sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->base.flags |= IPCORK_TS_OPT_ID; cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 608fa9d05b55..8476a3944a88 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -629,7 +629,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a705ec214254..f8942062f776 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2118,7 +2118,7 @@ retry: skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); - skb_setup_tx_timestamp(skb, sockc.tsflags); + skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -2650,7 +2650,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); - skb_setup_tx_timestamp(skb, sockc->tsflags); + skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); @@ -3115,7 +3115,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } - skb_setup_tx_timestamp(skb, sockc.tsflags); + skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { diff --git a/net/socket.c b/net/socket.c index 601ad74930ef..3b1b65b9f471 100644 --- a/net/socket.c +++ b/net/socket.c @@ -687,7 +687,7 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) +void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; -- cgit v1.3 From 5c2ab978f9c90384198000a032d10382f44c3530 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Thu, 3 Oct 2024 09:23:10 -0700 Subject: ethtool: rss: fix rss key initialization warning This warning is emitted when a driver does not default populate an rss key when one is not provided from userspace. Some devices do not support individual rss keys per context. For these devices, it is ok to leave the key zeroed out in ethtool_rxfh_context. Do not warn on zeroed key when ethtool_ops.rxfh_per_ctx_key == 0. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20241003162310.1310576-1-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 65cfe76dafbe..04b34dc6b369 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1505,6 +1505,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, extack); /* Make sure driver populates defaults */ WARN_ON_ONCE(!ret && !rxfh_dev.key && + ops->rxfh_per_ctx_key && !memchr_inv(ethtool_rxfh_context_key(ctx), 0, ctx->key_size)); } else if (rxfh_dev.rss_delete) { -- cgit v1.3 From 5a9071a760a61b00260334ad576fe60debafaafc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:40 +0000 Subject: tcp: annotate data-races around icsk->icsk_pending icsk->icsk_pending can be read locklessly already. Following patch in the series will add another lockless read. Add smp_load_acquire() and smp_store_release() annotations because following patch will add a test in tcp_write_timer(), and READ_ONCE()/WRITE_ONCE() alone would possibly lead to races. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 4 ++-- net/ipv4/inet_connection_sock.c | 6 ++++-- net/ipv4/inet_diag.c | 10 ++++++---- net/ipv4/tcp_ipv4.c | 10 ++++++---- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 4 ++-- net/ipv6/tcp_ipv6.c | 10 ++++++---- 7 files changed, 28 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c0deaafebfdc..914d19772704 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -197,7 +197,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) struct inet_connection_sock *icsk = inet_csk(sk); if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif @@ -229,7 +229,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { - icsk->icsk_pending = what; + smp_store_release(&icsk->icsk_pending, what); icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2c5632d4fddb..8c53385cc808 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -775,7 +775,8 @@ void inet_csk_clear_xmit_timers(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_pending = icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_pending, 0); + icsk->icsk_ack.pending = 0; sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -790,7 +791,8 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) /* ongoing timer handlers need to acquire socket lock. */ sock_not_owned_by_me(sk); - icsk->icsk_pending = icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_pending, 0); + icsk->icsk_ack.pending = 0; sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 67639309163d..321acc8abf17 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -247,6 +247,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + u8 icsk_pending; int protocol; cb_data = cb->data; @@ -307,14 +308,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto out; } - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5afe5e57c89b..985028434f64 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2900,15 +2900,17 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); + u8 icsk_pending; int rx_queue; int state; - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sk->sk_timer)) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4fd746bd4d54..4d0407301603 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2960,7 +2960,7 @@ void tcp_send_loss_probe(struct sock *sk) WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); - inet_csk(sk)->icsk_pending = 0; + smp_store_release(&inet_csk(sk)->icsk_pending, 0); return; } @@ -2993,7 +2993,7 @@ probe_sent: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ - inet_csk(sk)->icsk_pending = 0; + smp_store_release(&inet_csk(sk)->icsk_pending, 0); rearm_timer: tcp_rearm_rto(sk); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 79064580c8c0..56c597e763ac 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -701,11 +701,11 @@ void tcp_write_timer_handler(struct sock *sk) tcp_send_loss_probe(sk); break; case ICSK_TIME_RETRANS: - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: - icsk->icsk_pending = 0; + smp_store_release(&icsk->icsk_pending, 0); tcp_probe_timer(sk); break; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d71ab4e1efe1..7634c0be6acb 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2177,6 +2177,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; + u8 icsk_pending; int rx_queue; int state; @@ -2185,12 +2186,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); - if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + icsk_pending = smp_load_acquire(&icsk->icsk_pending); + if (icsk_pending == ICSK_TIME_RETRANS || + icsk_pending == ICSK_TIME_REO_TIMEOUT || + icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { -- cgit v1.3 From 3b784293016252118ed3b42c5479f20f89a0f384 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:41 +0000 Subject: tcp: add a fast path in tcp_write_timer() retransmit timer is not stopped from inet_csk_clear_xmit_timer() because we do not define INET_CSK_CLEAR_TIMERS. This is a conscious choice : for active TCP flows, it is better to only call mod_timer(), because there is more chances of keeping the timer unchanged. Also inet_csk_clear_xmit_timer() is often called from another cpu, and calling del_timer() would cause false sharing and lock contention. This means that very often, tcp_write_timer() is called at the timer expiration, while there is nothing to retransmit. This can be detected very early, avoiding the socket spinlock. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_timer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 56c597e763ac..b7266b9101ce 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -717,6 +717,10 @@ static void tcp_write_timer(struct timer_list *t) from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; + /* Avoid locking the socket when there is no pending event. */ + if (!smp_load_acquire(&icsk->icsk_pending)) + goto out; + bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_write_timer_handler(sk); @@ -726,6 +730,7 @@ static void tcp_write_timer(struct timer_list *t) sock_hold(sk); } bh_unlock_sock(sk); +out: sock_put(sk); } -- cgit v1.3 From 81df4fa94ee8c0800ed42c47357435602ed105ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Oct 2024 17:30:42 +0000 Subject: tcp: add a fast path in tcp_delack_timer() delack timer is not stopped from inet_csk_clear_xmit_timer() because we do not define INET_CSK_CLEAR_TIMERS. This is a conscious choice : inet_csk_clear_xmit_timer() is often called from another cpu. Calling del_timer() would cause false sharing and lock contention. This means that very often, tcp_delack_timer() is called at the timer expiration, while there is no ACK to transmit. This can be detected very early, avoiding the socket spinlock. Notes: - test about tp->compressed_ack is racy, but in the unlikely case there is a race, the dedicated compressed_ack_timer hrtimer would close it. - Even if the fast path is not taken, reading icsk->icsk_ack.pending and tp->compressed_ack before acquiring the socket spinlock reduces acquisition time and chances of contention. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241002173042.917928-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 5 +++-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/tcp_output.c | 3 ++- net/ipv4/tcp_timer.c | 9 +++++++++ net/mptcp/protocol.c | 3 ++- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 914d19772704..3c82fad904d4 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -202,7 +202,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -233,7 +233,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.pending |= ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); } else { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8c53385cc808..12e975ed4910 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -776,7 +776,7 @@ void inet_csk_clear_xmit_timers(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); smp_store_release(&icsk->icsk_pending, 0); - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); @@ -792,7 +792,7 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) sock_not_owned_by_me(sk); smp_store_release(&icsk->icsk_pending, 0); - icsk->icsk_ack.pending = 0; + smp_store_release(&icsk->icsk_ack.pending, 0); sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4d0407301603..08772395690d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4224,7 +4224,8 @@ void tcp_send_delayed_ack(struct sock *sk) if (!time_before(timeout, icsk->icsk_ack.timeout)) timeout = icsk->icsk_ack.timeout; } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b7266b9101ce..c3a7442332d4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -361,6 +361,14 @@ static void tcp_delack_timer(struct timer_list *t) from_timer(icsk, t, icsk_delack_timer); struct sock *sk = &icsk->icsk_inet.sk; + /* Avoid taking socket spinlock if there is no ACK to send. + * The compressed_ack check is racy, but a separate hrtimer + * will take care of it eventually. + */ + if (!(smp_load_acquire(&icsk->icsk_ack.pending) & ICSK_ACK_TIMER) && + !READ_ONCE(tcp_sk(sk)->compressed_ack)) + goto out; + bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); @@ -371,6 +379,7 @@ static void tcp_delack_timer(struct timer_list *t) sock_hold(sk); } bh_unlock_sock(sk); +out: sock_put(sk); } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c2317919fc14..e85862352084 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3504,7 +3504,8 @@ static void schedule_3rdack_retransmission(struct sock *ssk) timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + smp_store_release(&icsk->icsk_ack.pending, + icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } -- cgit v1.3 From f858cc9eed5b05cbe38d7ffd2787c21e3718eb7d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2024 12:12:18 +0000 Subject: net: add IFLA_MAX_PACING_OFFLOAD_HORIZON device attribute Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patch adds dev->max_pacing_offload_horizon expressing this timing wheel horizon in nsec units. This is a read-only attribute. Unless a driver sets it, dev->max_pacing_offload_horizon is zero. v2: addressed Jakub feedback ( https://lore.kernel.org/netdev/20240930152304.472767-2-edumazet@google.com/T/#mf6294d714c41cc459962154cc2580ce3c9693663 ) v3: added yaml doc (also per Jakub feedback) Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 4 ++++ Documentation/networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 4 ++++ tools/include/uapi/linux/if_link.h | 1 + 6 files changed, 15 insertions(+) (limited to 'net') diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 0c4d5d40cae9..d7131a1afadf 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1137,6 +1137,10 @@ attribute-sets: name: dpll-pin type: nest nested-attributes: link-dpll-pin-attrs + - + name: max-pacing-offload-horizon + type: uint + doc: EDT offload horizon supported by the device (in nsec). - name: af-spec-attrs attributes: diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a..49f03cb78c6e 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -183,3 +183,4 @@ struct_devlink_port* devlink_port struct_dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder +u64 max_pacing_offload_horizon diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..49a7e7db0883 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2009,6 +2009,8 @@ enum netdev_reg_state { * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * + * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2399,6 +2401,8 @@ struct net_device { /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; + u64 max_pacing_offload_horizon; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6dc258993b17..506ba9c80e83 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0a520987085..682d8d3127db 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1118,6 +1118,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + 0; } @@ -1867,6 +1868,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, READ_ONCE(dev->tso_max_size)) || nla_put_u32(skb, IFLA_TSO_MAX_SEGS, READ_ONCE(dev->tso_max_segs)) || + nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON, + READ_ONCE(dev->max_pacing_offload_horizon)) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, READ_ONCE(dev->num_rx_queues)) || @@ -1975,6 +1978,7 @@ nla_put_failure: } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { + [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN }, [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index f0d71b2a3f1e..96ec2b01e725 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -377,6 +377,7 @@ enum { IFLA_GSO_IPV4_MAX_SIZE, IFLA_GRO_IPV4_MAX_SIZE, IFLA_DPLL_PIN, + IFLA_MAX_PACING_OFFLOAD_HORIZON, __IFLA_MAX }; -- cgit v1.3 From f26080d47007df2ee90e65b7d390207ff3a588af Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 3 Oct 2024 12:12:19 +0000 Subject: net_sched: sch_fq: add the ability to offload pacing Some network devices have the ability to offload EDT (Earliest Departure Time) which is the model used for TCP pacing and FQ packet scheduler. Some of them implement the timing wheel mechanism described in https://saeed.github.io/files/carousel-sigcomm17.pdf with an associated 'timing wheel horizon'. This patchs adds to FQ packet scheduler TCA_FQ_OFFLOAD_HORIZON attribute. Its value is capped by the device max_pacing_offload_horizon, added in the prior patch. It allows FQ to let packets within pacing offload horizon to be delivered to the device, which will handle the needed delay without host involvement. Signed-off-by: Jeffrey Ji Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241003121219.2396589-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 2 ++ net/sched/sch_fq.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a3cd0c2dc995..25a9a47001cd 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -836,6 +836,8 @@ enum { TCA_FQ_WEIGHTS, /* Weights for each band */ + TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */ + __TCA_FQ_MAX }; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 19a49af5a9e5..aeabf45c9200 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -111,6 +111,7 @@ struct fq_perband_flows { struct fq_sched_data { /* Read mostly cache line */ + u64 offload_horizon; u32 quantum; u32 initial_quantum; u32 flow_refill_delay; @@ -299,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q, } /* Fast path can be used if : - * 1) Packet tstamp is in the past. + * 1) Packet tstamp is in the past, or within the pacing offload horizon. * 2) FQ qlen == 0 OR * (no flow is currently eligible for transmit, * AND fast path queue has less than 8 packets) @@ -314,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, const struct fq_sched_data *q = qdisc_priv(sch); const struct sock *sk; - if (fq_skb_cb(skb)->time_to_send > now) + if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon) return false; if (sch->q.qlen != 0) { @@ -595,15 +596,18 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now) unsigned long sample; struct rb_node *p; - if (q->time_next_delayed_flow > now) + if (q->time_next_delayed_flow > now + q->offload_horizon) return; /* Update unthrottle latency EWMA. * This is cheap and can help diagnosing timer/latency problems. */ sample = (unsigned long)(now - q->time_next_delayed_flow); - q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; - q->unthrottle_latency_ns += sample >> 3; + if ((long)sample > 0) { + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + } + now += q->offload_horizon; q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { @@ -687,7 +691,7 @@ begin: u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send, f->time_next_packet); - if (now < time_next_packet) { + if (now + q->offload_horizon < time_next_packet) { head->first = f->next; f->time_next_packet = time_next_packet; fq_flow_set_throttled(q, f); @@ -925,6 +929,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, [TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)), [TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)), + [TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 }, }; /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */ @@ -1100,6 +1105,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->horizon_drop, nla_get_u8(tb[TCA_FQ_HORIZON_DROP])); + if (tb[TCA_FQ_OFFLOAD_HORIZON]) { + u64 offload_horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]); + + if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) { + WRITE_ONCE(q->offload_horizon, offload_horizon); + } else { + NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon"); + err = -EINVAL; + } + } if (!err) { sch_tree_unlock(sch); @@ -1183,6 +1199,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) .bands = FQ_BANDS, }; struct nlattr *opts; + u64 offload_horizon; u64 ce_threshold; s32 weights[3]; u64 horizon; @@ -1199,6 +1216,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) horizon = READ_ONCE(q->horizon); do_div(horizon, NSEC_PER_USEC); + offload_horizon = READ_ONCE(q->offload_horizon); + do_div(offload_horizon, NSEC_PER_USEC); + if (nla_put_u32(skb, TCA_FQ_PLIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, @@ -1224,6 +1244,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_TIMER_SLACK, READ_ONCE(q->timer_slack)) || nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) || nla_put_u8(skb, TCA_FQ_HORIZON_DROP, READ_ONCE(q->horizon_drop))) goto nla_put_failure; -- cgit v1.3 From 0f4e6f94760026b4c46873bc21b440e6cef9c0b6 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 3 Jun 2024 10:16:14 +0200 Subject: batman-adv: Start new development cycle This version will contain all the (major or even only minor) changes for Linux 6.13. The version number isn't a semantic version number with major and minor information. It is just encoding the year of the expected publishing as Linux -rc1 and the number of published versions this year (starting at 0). Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 3d4c36ae2e1a..97ea71a052f8 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2024.2" +#define BATADV_SOURCE_VERSION "2024.3" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.3 From 4436df478860bb5da1864df2cd20f281a210f139 Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Fri, 7 Jun 2024 18:19:12 +0200 Subject: batman-adv: Add flex array to struct batadv_tvlv_tt_data The "struct batadv_tvlv_tt_data" uses a dynamically sized set of trailing elements. Specifically, it uses an array of structures of type "batadv_tvlv_tt_vlan_data". So, use the preferred way in the kernel declaring a flexible array [1]. At the same time, prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). In this case, it is important to note that the attribute used is specifically __counted_by_be since variable "num_vlan" is of type __be16. The following change to the "batadv_tt_tvlv_ogm_handler_v1" function: - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); - tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan); + tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data + + flex_size); is intended to prevent the compiler from generating an "out-of-bounds" notification due to the __counted_by attribute. The compiler can do a pointer calculation using the vlan_data flexible array memory, or in other words, this may be calculated as an array offset, since it is the same as: &tt_data->vlan_data[num_vlan] Therefore, we go past the end of the array. In other "multiple trailing flexible array" situations, this has been solved by addressing from the base pointer, since the compiler either knows the full allocation size or it knows nothing about it (this case, since it came from a "void *" function argument). The order in which the structure batadv_tvlv_tt_data and the structure batadv_tvlv_tt_vlan_data are defined must be swap to avoid an incomplete type error. Also, avoid the open-coded arithmetic in memory allocator functions [2] using the "struct_size" macro and use the "flex_array_size" helper to clarify some calculations, when possible. Moreover, the new structure member also allow us to avoid the open-coded arithmetic on pointers in some situations. Take advantage of this. This code was detected with the help of Coccinelle, and audited and modified manually. Link: https://www.kernel.org/doc/html/next/process/deprecated.html#zero-length-and-one-element-arrays [1] Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [2] Reviewed-by: Kees Cook Signed-off-by: Erick Archer Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 29 ++++++++++++---------- net/batman-adv/translation-table.c | 49 ++++++++++++++++---------------------- 2 files changed, 36 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 6e25753015df..439132a819ea 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -9,6 +9,7 @@ #include #include +#include #include /** @@ -592,19 +593,6 @@ struct batadv_tvlv_gateway_data { __be32 bandwidth_up; }; -/** - * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container - * @flags: translation table flags (see batadv_tt_data_flags) - * @ttvn: translation table version number - * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by - * one batadv_tvlv_tt_vlan_data object per announced vlan - */ -struct batadv_tvlv_tt_data { - __u8 flags; - __u8 ttvn; - __be16 num_vlan; -}; - /** * struct batadv_tvlv_tt_vlan_data - vlan specific tt data propagated through * the tt tvlv container @@ -618,6 +606,21 @@ struct batadv_tvlv_tt_vlan_data { __u16 reserved; }; +/** + * struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container + * @flags: translation table flags (see batadv_tt_data_flags) + * @ttvn: translation table version number + * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by + * one batadv_tvlv_tt_vlan_data object per announced vlan + * @vlan_data: array of batadv_tvlv_tt_vlan_data objects + */ +struct batadv_tvlv_tt_data { + __u8 flags; + __u8 ttvn; + __be16 num_vlan; + struct batadv_tvlv_tt_vlan_data vlan_data[] __counted_by_be(num_vlan); +}; + /** * struct batadv_tvlv_tt_change - translation table diff data * @flags: status indicators concerning the non-mesh client (see diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 2243cec18ecc..6815d1262feb 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -856,8 +857,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, num_entries += atomic_read(&vlan->tt.num_entries); } - change_offset = sizeof(**tt_data); - change_offset += num_vlan * sizeof(*tt_vlan); + change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) @@ -876,7 +876,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, (*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn); (*tt_data)->num_vlan = htons(num_vlan); - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); + tt_vlan = (*tt_data)->vlan_data; hlist_for_each_entry(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); @@ -936,8 +936,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, total_entries += vlan_entries; } - change_offset = sizeof(**tt_data); - change_offset += num_vlan * sizeof(*tt_vlan); + change_offset = struct_size(*tt_data, vlan_data, num_vlan); /* if tt_len is negative, allocate the space needed by the full table */ if (*tt_len < 0) @@ -956,7 +955,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, (*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn); (*tt_data)->num_vlan = htons(num_vlan); - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); + tt_vlan = (*tt_data)->vlan_data; hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) @@ -2916,7 +2915,6 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; - struct batadv_tvlv_tt_vlan_data *tt_vlan_req; struct batadv_hard_iface *primary_if; bool ret = false; int i, size; @@ -2932,7 +2930,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, if (!tt_req_node) goto out; - size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan; + size = struct_size(tvlv_tt_data, vlan_data, num_vlan); tvlv_tt_data = kzalloc(size, GFP_ATOMIC); if (!tvlv_tt_data) goto out; @@ -2944,12 +2942,10 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, /* send all the CRCs within the request. This is needed by intermediate * nodes to ensure they have the correct table before replying */ - tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1); for (i = 0; i < num_vlan; i++) { - tt_vlan_req->vid = tt_vlan->vid; - tt_vlan_req->crc = tt_vlan->crc; + tvlv_tt_data->vlan_data[i].vid = tt_vlan->vid; + tvlv_tt_data->vlan_data[i].crc = tt_vlan->crc; - tt_vlan_req++; tt_vlan++; } @@ -3001,7 +2997,6 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_orig_node *res_dst_orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; - struct batadv_tvlv_tt_vlan_data *tt_vlan; bool ret = false, full_table; u8 orig_ttvn, req_ttvn; u16 tvlv_len; @@ -3024,10 +3019,9 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); /* this node doesn't have the requested data */ if (orig_ttvn != req_ttvn || - !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan, + !batadv_tt_global_check_crc(req_dst_orig_node, tt_data->vlan_data, ntohs(tt_data->num_vlan))) goto out; @@ -3370,7 +3364,6 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; u8 *tvlv_ptr = (u8 *)tt_data; - u16 change_offset; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", @@ -3383,10 +3376,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_lock_bh(&orig_node->tt_lock); - change_offset = sizeof(struct batadv_tvlv_tt_vlan_data); - change_offset *= ntohs(tt_data->num_vlan); - change_offset += sizeof(*tt_data); - tvlv_ptr += change_offset; + tvlv_ptr += struct_size(tt_data, vlan_data, ntohs(tt_data->num_vlan)); tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr; if (tt_data->flags & BATADV_TT_FULL_TABLE) { @@ -3985,10 +3975,10 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, u8 flags, void *tvlv_value, u16 tvlv_value_len) { - struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; + size_t flex_size; if (tvlv_value_len < sizeof(*tt_data)) return; @@ -3998,17 +3988,18 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, num_vlan = ntohs(tt_data->num_vlan); - if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan) + flex_size = flex_array_size(tt_data, vlan_data, num_vlan); + if (tvlv_value_len < flex_size) return; - tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); - tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan); - tvlv_value_len -= sizeof(*tt_vlan) * num_vlan; + tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data + + flex_size); + tvlv_value_len -= flex_size; num_entries = batadv_tt_entries(tvlv_value_len); - batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change, - num_entries, tt_data->ttvn); + batadv_tt_update_orig(bat_priv, orig, tt_data->vlan_data, num_vlan, + tt_change, num_entries, tt_data->ttvn); } /** @@ -4039,8 +4030,8 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); - tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data); - tt_vlan_len *= ntohs(tt_data->num_vlan); + tt_vlan_len = flex_array_size(tt_data, vlan_data, + ntohs(tt_data->num_vlan)); if (tvlv_value_len < tt_vlan_len) return NET_RX_SUCCESS; -- cgit v1.3 From 5c956d11cfca57650e881522f5995e9d8b548423 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 6 Oct 2024 19:00:01 +0200 Subject: batman-adv: Use string choice helper to print booleans The commit ea4692c75e1c ("lib/string_helpers: Consolidate string helpers implementation") introduced common helpers to print string representations of boolean helpers. These are supposed to be used instead of open coded versions. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 ++-- net/batman-adv/bridge_loop_avoidance.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 74b49c35ddc1..07ae5dd1f150 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -371,8 +372,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, batadv_ogm_packet->orig, ntohl(batadv_ogm_packet->seqno), batadv_ogm_packet->tq, batadv_ogm_packet->ttl, - ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ? - "on" : "off"), + str_on_off(batadv_ogm_packet->flags & BATADV_DIRECTLINK), hard_iface->net_dev->name, hard_iface->net_dev->dev_addr); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 5f46ca3d4bb8..449faf5a5487 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1946,16 +1947,15 @@ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) { + bool local = batadv_is_my_client(bat_priv, ethhdr->h_source, vid); + /* possible optimization: race for a claim */ /* No claim exists yet, claim it for us! */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Unclaimed MAC %pM found. Claim it. Local: %s\n", - __func__, ethhdr->h_source, - batadv_is_my_client(bat_priv, - ethhdr->h_source, vid) ? - "yes" : "no"); + __func__, ethhdr->h_source, str_yes_no(local)); batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); -- cgit v1.3 From be5498cac2ddb112c5bd7433d5e834a1a2493427 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Jun 2024 23:58:44 -0400 Subject: remove pointless includes of some of those used to be needed, some had been cargo-culted for no reason... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/fcntl.c | 1 - fs/file_table.c | 1 - fs/notify/fanotify/fanotify.c | 1 - fs/notify/fanotify/fanotify_user.c | 1 - fs/overlayfs/copy_up.c | 1 - fs/proc/base.c | 1 - io_uring/io_uring.c | 1 - kernel/bpf/bpf_inode_storage.c | 1 - kernel/bpf/bpf_task_storage.c | 1 - kernel/bpf/token.c | 1 - kernel/exit.c | 1 - kernel/module/dups.c | 1 - kernel/module/kmod.c | 1 - kernel/umh.c | 1 - net/handshake/request.c | 1 - security/apparmor/domain.c | 1 - 16 files changed, 16 deletions(-) (limited to 'net') diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..8928874c8a2e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/file_table.c b/fs/file_table.c index eed5ffad9997..9e46fd4336b0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 224bccaab4cc..24c7c5df4998 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9644bc72e457..61b83039771e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include #include #include #include diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2ed6ad641a20..ee2cbd044ce6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include "overlayfs.h" diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52..e9d7ddc52f69 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include #include diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b2736e3491b8..5a1676bab998 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..e16e79f8cd6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include #include #include -#include #include DEFINE_BPF_STORAGE_CACHE(inode_cache); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..1eb9852a9f8e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include #include #include -#include #include DEFINE_BPF_STORAGE_CACHE(task_cache); diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include diff --git a/kernel/exit.c b/kernel/exit.c index 619f0014c33b..1dcddfe537ee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/module/dups.c b/kernel/module/dups.c index 9a92f2f8c9d3..bd2149fbe117 100644 --- a/kernel/module/dups.c +++ b/kernel/module/dups.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c index 0800d9891692..25f253812512 100644 --- a/kernel/module/kmod.c +++ b/kernel/module/kmod.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/umh.c b/kernel/umh.c index ff1f13a27d29..be9234270777 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/net/handshake/request.c b/net/handshake/request.c index 94d5cef3e048..274d2c89b6b2 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c index 571158ec6188..2bc34dce9a46 100644 --- a/security/apparmor/domain.c +++ b/security/apparmor/domain.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include -- cgit v1.3 From 269084f748524fa1a3fb8eb530eb70f77e7c3e4a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 3 Oct 2024 16:22:31 +0800 Subject: net: tcp: refresh tcp_mstamp for compressed ack in timer For now, we refresh the tcp_mstamp for delayed acks and keepalives, but not for the compressed ack in tcp_compressed_ack_kick(). I have not found out the effact of the tcp_mstamp when sending ack, but we can still refresh it for the compressed ack to keep consistent. Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241003082231.759759-1-dongml2@chinatelecom.cn Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c3a7442332d4..b412ed88ccd9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -865,6 +865,7 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) * LINUX_MIB_TCPACKCOMPRESSED accurate. */ tp->compressed_ack--; + tcp_mstamp_refresh(tp); tcp_send_ack(sk); } } else { -- cgit v1.3 From 539770616521e5b046ca7612eb79ba11b53edb1d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 3 Oct 2024 12:52:17 +0100 Subject: net: dsa: remove obsolete phylink dsa_switch operations No driver now uses the DSA switch phylink members, so we can now remove the method pointers, but we need to leave empty shim functions to allow those drivers that do not provide phylink MAC operations structure to continue functioning. Signed-off-by: Russell King (oracle) Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean # sja1105, felix, dsa_loop Link: https://patch.msgid.link/E1swKNV-0060oN-1b@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 15 --------------- net/dsa/dsa.c | 8 -------- net/dsa/port.c | 34 +--------------------------------- 3 files changed, 1 insertion(+), 56 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d7a6c2930277..72ae65e7246a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -885,21 +885,6 @@ struct dsa_switch_ops { */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, - int port, - phy_interface_t iface); - void (*phylink_mac_config)(struct dsa_switch *ds, int port, - unsigned int mode, - const struct phylink_link_state *state); - void (*phylink_mac_link_down)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface); - void (*phylink_mac_link_up)(struct dsa_switch *ds, int port, - unsigned int mode, - phy_interface_t interface, - struct phy_device *phydev, - int speed, int duplex, - bool tx_pause, bool rx_pause); void (*phylink_fixed_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); /* diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 1664547deffd..5a7c0e565a89 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1505,14 +1505,6 @@ static int dsa_switch_probe(struct dsa_switch *ds) if (!ds->num_ports) return -EINVAL; - if (ds->phylink_mac_ops) { - if (ds->ops->phylink_mac_select_pcs || - ds->ops->phylink_mac_config || - ds->ops->phylink_mac_link_down || - ds->ops->phylink_mac_link_up) - return -EINVAL; - } - if (np) { err = dsa_switch_parse_of(ds, np); if (err) diff --git a/net/dsa/port.c b/net/dsa/port.c index 25258b33e59e..f1e96706a701 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1579,40 +1579,19 @@ static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP); - struct dsa_switch *ds = dp->ds; - - if (ds->ops->phylink_mac_select_pcs) - pcs = ds->ops->phylink_mac_select_pcs(ds, dp->index, interface); - - return pcs; + return ERR_PTR(-EOPNOTSUPP); } static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_config) - return; - - ds->ops->phylink_mac_config(ds, dp->index, mode, state); } static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_down) - return; - - ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); } static void dsa_port_phylink_mac_link_up(struct phylink_config *config, @@ -1622,14 +1601,6 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, int speed, int duplex, bool tx_pause, bool rx_pause) { - struct dsa_port *dp = dsa_phylink_to_port(config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_link_up) - return; - - ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev, - speed, duplex, tx_pause, rx_pause); } static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { @@ -1871,9 +1842,6 @@ static void dsa_shared_port_link_down(struct dsa_port *dp) if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down) ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED, PHY_INTERFACE_MODE_NA); - else if (ds->ops->phylink_mac_link_down) - ds->ops->phylink_mac_link_down(ds, dp->index, MLO_AN_FIXED, - PHY_INTERFACE_MODE_NA); } int dsa_shared_port_link_register_of(struct dsa_port *dp) -- cgit v1.3 From 8a0f62fdeb9ea66ad3d0e959c7c4addbabeac1be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:17 +0000 Subject: ipv4: remove fib_devindex_hashfn() fib_devindex_hashfn() converts a 32bit ifindex value to a 8bit hash. It makes no sense doing this from fib_info_hashfn() and fib_find_info_nh(). It is better to keep as many bits as possible to let fib_info_hashfn_result() have better spread. Only fib_info_devhash_bucket() needs to make this operation, we can 'inline' trivial fib_devindex_hashfn() in it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1a847ba40458..1219d1b32591 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -322,17 +322,12 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) return 0; } -static inline unsigned int fib_devindex_hashfn(unsigned int val) -{ - return hash_32(val, DEVINDEX_HASHBITS); -} - static struct hlist_head * fib_info_devhash_bucket(const struct net_device *dev) { u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; - return &fib_info_devhash[fib_devindex_hashfn(val)]; + return &fib_info_devhash[hash_32(val, DEVINDEX_HASHBITS)]; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, @@ -362,10 +357,10 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) fi->fib_priority); if (fi->nh) { - val ^= fib_devindex_hashfn(fi->nh->id); + val ^= fi->nh->id; } else { for_nexthops(fi) { - val ^= fib_devindex_hashfn(nh->fib_nh_oif); + val ^= nh->fib_nh_oif; } endfor_nexthops(fi) } @@ -380,7 +375,7 @@ static struct fib_info *fib_find_info_nh(struct net *net, struct fib_info *fi; unsigned int hash; - hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id), + hash = fib_info_hashfn_1(cfg->fc_nh_id, cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); -- cgit v1.3 From fc38b28365e5f1396209d2878a34065468765087 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:18 +0000 Subject: ipv4: use rcu in ip_fib_check_default() fib_info_devhash[] is not resized in fib_info_hash_move(). fib_nh structs are already freed after an rcu grace period. This will allow to remove fib_info_lock in the following patch. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1219d1b32591..e0ffb4ffd95d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -275,7 +275,7 @@ void fib_release_info(struct fib_info *fi) change_nexthops(fi) { if (!nexthop_nh->fib_nh_dev) continue; - hlist_del(&nexthop_nh->nh_hash); + hlist_del_rcu(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } /* Paired with READ_ONCE() from fib_table_lookup() */ @@ -431,28 +431,23 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) } /* Check, that the gateway is already configured. - * Used only by redirect accept routine. + * Used only by redirect accept routine, under rcu_read_lock(); */ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; - spin_lock(&fib_info_lock); - head = fib_info_devhash_bucket(dev); - hlist_for_each_entry(nh, head, nh_hash) { + hlist_for_each_entry_rcu(nh, head, nh_hash) { if (nh->fib_nh_dev == dev && nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { - spin_unlock(&fib_info_lock); return 0; } } - spin_unlock(&fib_info_lock); - return -1; } @@ -1606,7 +1601,7 @@ link_it: if (!nexthop_nh->fib_nh_dev) continue; head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); - hlist_add_head(&nexthop_nh->nh_hash, head); + hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } spin_unlock_bh(&fib_info_lock); -- cgit v1.3 From 143ca845ec0c625c410768c36e1a949ef4ed1915 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:19 +0000 Subject: ipv4: remove fib_info_lock After the prior patch, fib_info_lock became redundant because all of its users are holding RTNL. BH protection is not needed. Remove the READ_ONCE()/WRITE_ONCE() annotations around fib_info_cnt, since it is protected by RTNL. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e0ffb4ffd95d..ece779bfb8f6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,7 +50,6 @@ #include "fib_lookup.h" -static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; static unsigned int fib_info_hash_size; @@ -260,12 +259,11 @@ EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { - spin_lock_bh(&fib_info_lock); + ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - /* Paired with READ_ONCE() in fib_create_info(). */ - WRITE_ONCE(fib_info_cnt, fib_info_cnt - 1); + fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -282,7 +280,6 @@ void fib_release_info(struct fib_info *fi) WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } - spin_unlock_bh(&fib_info_lock); } static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) @@ -1266,7 +1263,7 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, unsigned int old_size = fib_info_hash_size; unsigned int i; - spin_lock_bh(&fib_info_lock); + ASSERT_RTNL(); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; fib_info_hash_size = new_size; @@ -1303,8 +1300,6 @@ static void fib_info_hash_move(struct hlist_head *new_info_hash, } } - spin_unlock_bh(&fib_info_lock); - kvfree(old_info_hash); kvfree(old_laddrhash); } @@ -1380,6 +1375,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; + ASSERT_RTNL(); if (cfg->fc_type > RTN_MAX) goto err_inval; @@ -1422,8 +1418,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, err = -ENOBUFS; - /* Paired with WRITE_ONCE() in fib_release_info() */ - if (READ_ONCE(fib_info_cnt) >= fib_info_hash_size) { + if (fib_info_cnt >= fib_info_hash_size) { unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; @@ -1582,7 +1577,7 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - spin_lock_bh(&fib_info_lock); + fib_info_cnt++; hlist_add_head(&fi->fib_hash, &fib_info_hash[fib_info_hashfn(fi)]); @@ -1604,7 +1599,6 @@ link_it: hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } - spin_unlock_bh(&fib_info_lock); return fi; err_inval: -- cgit v1.3 From a3f5f4c2f9b6bc2aa6f5a3e8e23b7519e4f2e3e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Oct 2024 13:47:20 +0000 Subject: ipv4: remove fib_info_devhash[] Upcoming per-netns RTNL conversion needs to get rid of shared hash tables. fib_info_devhash[] is one of them. It is unclear why we used a hash table, because a single hlist_head per net device was cheaper and scalable. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241004134720.579244-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 ++ net/ipv4/fib_semantics.c | 35 ++++++++++------------ 3 files changed, 19 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 49f03cb78c6e..556711c4d3cf 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -83,6 +83,7 @@ unsigned_int allmulti bool uc_promisc unsigned_char nested_level struct_in_device* ip_ptr read_mostly read_mostly __in_dev_get +struct hlist_head fib_nh_head struct_inet6_dev* ip6_ptr read_mostly read_mostly __in6_dev_get struct_vlan_info* vlan_info struct_dsa_port* dsa_ptr diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 49a7e7db0883..3baf8e539b6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2211,6 +2211,9 @@ struct net_device { /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; + /** @fib_nh_head: nexthops associated with this netdev */ + struct hlist_head fib_nh_head; + #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ece779bfb8f6..d2cee5c314f5 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -56,10 +56,6 @@ static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; -#define DEVINDEX_HASHBITS 8 -#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) -static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -319,12 +315,9 @@ static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) return 0; } -static struct hlist_head * -fib_info_devhash_bucket(const struct net_device *dev) +static struct hlist_head *fib_nh_head(struct net_device *dev) { - u32 val = net_hash_mix(dev_net(dev)) ^ dev->ifindex; - - return &fib_info_devhash[hash_32(val, DEVINDEX_HASHBITS)]; + return &dev->fib_nh_head; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, @@ -435,11 +428,11 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) struct hlist_head *head; struct fib_nh *nh; - head = fib_info_devhash_bucket(dev); + head = fib_nh_head(dev); hlist_for_each_entry_rcu(nh, head, nh_hash) { - if (nh->fib_nh_dev == dev && - nh->fib_nh_gw4 == gw && + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { return 0; } @@ -1595,7 +1588,7 @@ link_it: if (!nexthop_nh->fib_nh_dev) continue; - head = fib_info_devhash_bucket(nexthop_nh->fib_nh_dev); + head = fib_nh_head(nexthop_nh->fib_nh_dev); hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } @@ -1948,12 +1941,12 @@ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { - struct hlist_head *head = fib_info_devhash_bucket(dev); + struct hlist_head *head = fib_nh_head(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { - if (nh->fib_nh_dev == dev) - fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } @@ -1967,7 +1960,7 @@ void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { - struct hlist_head *head = fib_info_devhash_bucket(dev); + struct hlist_head *head = fib_nh_head(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; @@ -1981,7 +1974,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) int dead; BUG_ON(!fi->fib_nhs); - if (nh->fib_nh_dev != dev || fi == prev_fi) + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (fi == prev_fi) continue; prev_fi = fi; dead = 0; @@ -2131,7 +2125,7 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) } prev_fi = NULL; - head = fib_info_devhash_bucket(dev); + head = fib_nh_head(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { @@ -2139,7 +2133,8 @@ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) int alive; BUG_ON(!fi->fib_nhs); - if (nh->fib_nh_dev != dev || fi == prev_fi) + DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); + if (fi == prev_fi) continue; prev_fi = fi; -- cgit v1.3 From 76aed95319da25d6884dff01d5f0149e4b542f96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:29 -0700 Subject: rtnetlink: Add per-netns RTNL. The goal is to break RTNL down into per-netns mutex. This patch adds per-netns mutex and its helper functions, rtnl_net_lock() and rtnl_net_unlock(). rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and rtnl_net_unlock() releases them. We will replace 800+ rtnl_lock() with rtnl_net_lock() and finally removes rtnl_lock() in rtnl_net_lock(). When we need to nest per-netns RTNL mutex, we will use __rtnl_net_lock(), and its locking order is defined by rtnl_net_lock_cmp_fn() as follows: 1. init_net is first 2. netns address ascending order Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL with LOCKDEP so that we can carefully add the extra mutex without slowing down RTNL operations during conversion. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 21 ++++++++++++++++ include/net/net_namespace.h | 4 ++++ net/Kconfig.debug | 15 ++++++++++++ net/core/net_namespace.c | 6 +++++ net/core/rtnetlink.c | 58 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index cdfc897f1e3c..edd840a49989 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -92,6 +92,27 @@ static inline bool lockdep_rtnl_is_held(void) #define rcu_replace_pointer_rtnl(rp, p) \ rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net); +void __rtnl_net_unlock(struct net *net); +void rtnl_net_lock(struct net *net); +void rtnl_net_unlock(struct net *net); +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); +#else +static inline void __rtnl_net_lock(struct net *net) {} +static inline void __rtnl_net_unlock(struct net *net) {} + +static inline void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); +} + +static inline void rtnl_net_unlock(struct net *net) +{ + rtnl_unlock(); +} +#endif + static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e67b483cc8bb..873c0f9fdac6 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -188,6 +188,10 @@ struct net { #if IS_ENABLED(CONFIG_SMC) struct netns_smc smc; #endif +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + /* Move to a better place when the config guard is removed. */ + struct mutex rtnl_mutex; +#endif } __randomize_layout; #include diff --git a/net/Kconfig.debug b/net/Kconfig.debug index 5e3fffe707dd..277fab8c4d77 100644 --- a/net/Kconfig.debug +++ b/net/Kconfig.debug @@ -24,3 +24,18 @@ config DEBUG_NET help Enable extra sanity checks in networking. This is mostly used by fuzzers, but is safe to select. + +config DEBUG_NET_SMALL_RTNL + bool "Add extra per-netns mutex inside RTNL" + depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT + select PROVE_LOCKING + default n + help + rtnl_lock() is being replaced with rtnl_net_lock() that + acquires the global RTNL and a small per-netns RTNL mutex. + + During the conversion, rtnl_net_lock() just adds an extra + mutex in every RTNL scope and slows down the operations. + + Once the conversion completes, rtnl_lock() will be removed + and rtnetlink will gain per-netns scalability. diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e39479f1c9a4..105e3cd26763 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -334,6 +334,12 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); mutex_init(&net->ipv4.ra_mutex); + +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + mutex_init(&net->rtnl_mutex); + lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); +#endif + preinit_net_sysctl(net); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 682d8d3127db..46567fa54e42 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -179,6 +179,64 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void __rtnl_net_lock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_lock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_lock); + +void __rtnl_net_unlock(struct net *net) +{ + ASSERT_RTNL(); + + mutex_unlock(&net->rtnl_mutex); +} +EXPORT_SYMBOL(__rtnl_net_unlock); + +void rtnl_net_lock(struct net *net) +{ + rtnl_lock(); + __rtnl_net_lock(net); +} +EXPORT_SYMBOL(rtnl_net_lock); + +void rtnl_net_unlock(struct net *net) +{ + __rtnl_net_unlock(net); + rtnl_unlock(); +} +EXPORT_SYMBOL(rtnl_net_unlock); + +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + if (net_eq(net_a, net_b)) + return 0; + + /* always init_net first */ + if (net_eq(net_a, &init_net)) + return -1; + + if (net_eq(net_b, &init_net)) + return 1; + + /* otherwise lock in ascending order */ + return net_a < net_b ? -1 : 1; +} + +int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) +{ + const struct net *net_a, *net_b; + + net_a = container_of(a, struct net, rtnl_mutex.dep_map); + net_b = container_of(b, struct net, rtnl_mutex.dep_map); + + return rtnl_net_cmp_locks(net_a, net_b); +} +#endif + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) -- cgit v1.3 From 844e5e7e656d3a7a904fd5607f8491d6fd01db8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:30 -0700 Subject: rtnetlink: Add assertion helpers for per-netns RTNL. Once an RTNL scope is converted with rtnl_net_lock(), we will replace RTNL helper functions inside the scope with the following per-netns alternatives: ASSERT_RTNL() -> ASSERT_RTNL_NET(net) rcu_dereference_rtnl(p) -> rcu_dereference_rtnl_net(net, p) Note that the per-netns helpers are equivalent to the conventional helpers unless CONFIG_DEBUG_NET_SMALL_RTNL is enabled. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 45 +++++++++++++++++++++++++++++++++++++++++---- net/core/rtnetlink.c | 12 ++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index edd840a49989..8468a4ce8510 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -51,6 +51,10 @@ extern atomic_t dev_unreg_count; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; +#define ASSERT_RTNL() \ + WARN_ONCE(!rtnl_is_locked(), \ + "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) + #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); #else @@ -98,6 +102,22 @@ void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); + +bool rtnl_net_is_locked(struct net *net); + +#define ASSERT_RTNL_NET(net) \ + WARN_ONCE(!rtnl_net_is_locked(net), \ + "RTNL_NET: assertion failed at %s (%d)\n", \ + __FILE__, __LINE__) + +bool lockdep_rtnl_net_is_held(struct net *net); + +#define rcu_dereference_rtnl_net(net, p) \ + rcu_dereference_check(p, lockdep_rtnl_net_is_held(net)) +#define rtnl_net_dereference(net, p) \ + rcu_dereference_protected(p, lockdep_rtnl_net_is_held(net)) +#define rcu_replace_pointer_rtnl_net(net, rp, p) \ + rcu_replace_pointer(rp, p, lockdep_rtnl_net_is_held(net)) #else static inline void __rtnl_net_lock(struct net *net) {} static inline void __rtnl_net_unlock(struct net *net) {} @@ -111,6 +131,27 @@ static inline void rtnl_net_unlock(struct net *net) { rtnl_unlock(); } + +static inline void ASSERT_RTNL_NET(struct net *net) +{ + ASSERT_RTNL(); +} + +static inline void *rcu_dereference_rtnl_net(struct net *net, void *p) +{ + return rcu_dereference_rtnl(p); +} + +static inline void *rtnl_net_dereference(struct net *net, void *p) +{ + return rtnl_dereference(p); +} + +static inline void *rcu_replace_pointer_rtnl_net(struct net *net, + void *rp, void *p) +{ + return rcu_replace_pointer_rtnl(rp, p); +} #endif static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) @@ -140,10 +181,6 @@ void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); -#define ASSERT_RTNL() \ - WARN_ONCE(!rtnl_is_locked(), \ - "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) - extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 46567fa54e42..6d68247aea70 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -235,6 +235,18 @@ int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map * return rtnl_net_cmp_locks(net_a, net_b); } + +bool rtnl_net_is_locked(struct net *net) +{ + return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_net_is_locked); + +bool lockdep_rtnl_net_is_held(struct net *net) +{ + return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); +} +EXPORT_SYMBOL(lockdep_rtnl_net_is_held); #endif static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; -- cgit v1.3 From 03fa534856593bb4edf4935451fa55863e34a108 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 4 Oct 2024 15:10:31 -0700 Subject: rtnetlink: Add ASSERT_RTNL_NET() placeholder for netdev notifier. The global and per-netns netdev notifier depend on RTNL, and its dependency is not so clear due to nested calls. Let's add a placeholder to place ASSERT_RTNL_NET() for each event. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/Makefile | 1 + net/core/rtnl_net_debug.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 net/core/rtnl_net_debug.c (limited to 'net') diff --git a/net/core/Makefile b/net/core/Makefile index c3ebbaf9c81e..5a72a87ee0f1 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -45,3 +45,4 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o +obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c new file mode 100644 index 000000000000..e90a32242e22 --- /dev/null +++ b/net/core/rtnl_net_debug.c @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright Amazon.com Inc. or its affiliates. */ + +#include +#include +#include +#include +#include +#include + +static int rtnl_net_debug_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + enum netdev_cmd cmd = event; + + /* Keep enum and don't add default to trigger -Werror=switch */ + switch (cmd) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_REBOOT: + case NETDEV_CHANGE: + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_PRE_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_PRE_UP: + case NETDEV_PRE_TYPE_CHANGE: + case NETDEV_POST_TYPE_CHANGE: + case NETDEV_POST_INIT: + case NETDEV_PRE_UNINIT: + case NETDEV_RELEASE: + case NETDEV_NOTIFY_PEERS: + case NETDEV_JOIN: + case NETDEV_CHANGEUPPER: + case NETDEV_RESEND_IGMP: + case NETDEV_PRECHANGEMTU: + case NETDEV_CHANGEINFODATA: + case NETDEV_BONDING_INFO: + case NETDEV_PRECHANGEUPPER: + case NETDEV_CHANGELOWERSTATE: + case NETDEV_UDP_TUNNEL_PUSH_INFO: + case NETDEV_UDP_TUNNEL_DROP_INFO: + case NETDEV_CHANGE_TX_QUEUE_LEN: + case NETDEV_CVLAN_FILTER_PUSH_INFO: + case NETDEV_CVLAN_FILTER_DROP_INFO: + case NETDEV_SVLAN_FILTER_PUSH_INFO: + case NETDEV_SVLAN_FILTER_DROP_INFO: + case NETDEV_OFFLOAD_XSTATS_ENABLE: + case NETDEV_OFFLOAD_XSTATS_DISABLE: + case NETDEV_OFFLOAD_XSTATS_REPORT_USED: + case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: + case NETDEV_XDP_FEAT_CHANGE: + ASSERT_RTNL(); + break; + + /* Once an event fully supports RTNL_NET, move it here + * and remove "if (0)" below. + * + * case NETDEV_XXX: + * ASSERT_RTNL_NET(net); + * break; + */ + } + + /* Just to avoid unused-variable error for dev and net. */ + if (0) + ASSERT_RTNL_NET(net); + + return NOTIFY_DONE; +} + +static int rtnl_net_debug_net_id; + +static int __net_init rtnl_net_debug_net_init(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + nb->notifier_call = rtnl_net_debug_event; + + return register_netdevice_notifier_net(net, nb); +} + +static void __net_exit rtnl_net_debug_net_exit(struct net *net) +{ + struct notifier_block *nb; + + nb = net_generic(net, rtnl_net_debug_net_id); + unregister_netdevice_notifier_net(net, nb); +} + +static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { + .init = rtnl_net_debug_net_init, + .exit = rtnl_net_debug_net_exit, + .id = &rtnl_net_debug_net_id, + .size = sizeof(struct notifier_block), +}; + +static struct notifier_block rtnl_net_debug_block = { + .notifier_call = rtnl_net_debug_event, +}; + +static int __init rtnl_net_debug_init(void) +{ + int ret; + + ret = register_pernet_device(&rtnl_net_debug_net_ops); + if (ret) + return ret; + + ret = register_netdevice_notifier(&rtnl_net_debug_block); + if (ret) + unregister_pernet_subsys(&rtnl_net_debug_net_ops); + + return ret; +} + +static void __exit rtnl_net_debug_exit(void) +{ + unregister_netdevice_notifier(&rtnl_net_debug_block); + unregister_pernet_device(&rtnl_net_debug_net_ops); +} + +subsys_initcall(rtnl_net_debug_init); -- cgit v1.3 From eb62f49de7eca5917be8cebb3ad8aa3710af7021 Mon Sep 17 00:00:00 2001 From: Mahe Tardy Date: Mon, 7 Oct 2024 09:59:57 +0000 Subject: bpf: add get_netns_cookie helper to tc programs This is needed in the context of Cilium and Tetragon to retrieve netns cookie from hostns when traffic leaves Pod, so that we can correlate skb->sk's netns cookie. Signed-off-by: Mahe Tardy Link: https://lore.kernel.org/r/20241007095958.97442-1-mahe.tardy@gmail.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e61ac225c41b..9c0b47bfaa77 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5138,6 +5138,17 @@ static u64 __bpf_get_netns_cookie(struct sock *sk) return net->net_cookie; } +BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) +{ + return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_proto = { + .func = bpf_get_netns_cookie, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) { return __bpf_get_netns_cookie(ctx); @@ -8209,6 +8220,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: -- cgit v1.3 From 2b73e9ab8535caca192844a161d9f491ee9f8aab Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Tue, 10 Sep 2024 13:40:03 -0700 Subject: wifi: mac80211: constify ieee80211_ie_build_{he,eht}_oper() chandef The chandef parameter passed to ieee80211_ie_build_he_oper() and ieee80211_ie_build_eht_oper is read-only. Since it is never modified, add the const qualifier to this parameter. This makes these consistent with ieee80211_ie_build_ht_oper() and ieee80211_ie_build_vht_oper(). Signed-off-by: Jeff Johnson Link: https://patch.msgid.link/20240910-wireless-utils-constify-v1-1-e59947bcb3c3@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 4 ++-- net/mac80211/util.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4f0390918b60..3f4d2773b828 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2545,8 +2545,8 @@ u8 *ieee80211_ie_build_vht_cap(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, const struct cfg80211_chan_def *chandef); u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata); -u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); -u8 *ieee80211_ie_build_eht_oper(u8 *pos, struct cfg80211_chan_def *chandef, +u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef); +u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap); int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f94faa86ba8a..f0db60878321 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2752,7 +2752,7 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef) +u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; @@ -2844,7 +2844,7 @@ out: return pos; } -u8 *ieee80211_ie_build_eht_oper(u8 *pos, struct cfg80211_chan_def *chandef, +u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap) { -- cgit v1.3 From 4b482281eeb263d5bbbe75e1c5688d80daabb20e Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 13 Sep 2024 11:49:19 +0300 Subject: wifi: mac80211, cfg80211: miscellaneous spelling fixes Correct spelling here and there as suggested by codespell. Signed-off-by: Dmitry Antipov Link: https://patch.msgid.link/20240913084919.118862-1-dmantipov@yandex.ru Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 2 +- net/mac80211/chan.c | 4 ++-- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mesh.c | 2 +- net/mac80211/mesh_plink.c | 2 +- net/mac80211/mesh_sync.c | 2 +- net/mac80211/rc80211_minstrel_ht.c | 2 +- net/mac80211/sta_info.h | 2 +- net/mac80211/tkip.c | 2 +- net/mac80211/tx.c | 2 +- net/mac80211/util.c | 2 +- net/mac80211/vht.c | 4 ++-- net/wireless/chan.c | 2 +- net/wireless/nl80211.c | 4 ++-- net/wireless/radiotap.c | 2 +- net/wireless/reg.c | 2 +- net/wireless/util.c | 2 +- net/wireless/wext-compat.c | 2 +- net/wireless/wext-core.c | 2 +- 19 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 1c18b862ef8c..04cb45cfb310 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -797,7 +797,7 @@ void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, if (!test_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)) { ieee80211_send_addba_with_timeout(sta, tid_tx); - /* RESPONSE_RECEIVED state whould trigger the flow again */ + /* RESPONSE_RECEIVED state would trigger the flow again */ return; } diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index cca6d14084d2..a155e418d26b 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -462,7 +462,7 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, continue; /* vif changed to narrow BW and narrow BW for station wasn't - * requested or vise versa */ + * requested or vice versa */ if ((new_sta_bw < link_sta->pub->bandwidth) == !narrowed) continue; @@ -1118,7 +1118,7 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, * * Consider ctx1..3, link1..6, each ctx has 2 links. link1 and * link2 from ctx1 request new different chandefs starting 2 - * in-place reserations with ctx4 and ctx5 replacing ctx1 and + * in-place reservations with ctx4 and ctx5 replacing ctx1 and * ctx2 respectively. Next link5 and link6 from ctx3 reserve * ctx4. If link3 and link4 remain on ctx2 as they are then this * fails unless `replace_ctx` from ctx5 is replaced with ctx3. diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3f4d2773b828..afb867dc6b24 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2452,7 +2452,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) /* * If quiescing is set, we are racing with __ieee80211_suspend. * __ieee80211_suspend flushes the workers after setting quiescing, - * and we check quiescing / suspended before enqueing new workers. + * and we check quiescing / suspended before enqueuing new workers. * We should abort the worker to avoid the races below. */ if (local->quiescing) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index f94e4be0be12..0460102c8796 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1482,7 +1482,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, if (!elems) return; - /* ignore non-mesh or secure / unsecure mismatch */ + /* ignore non-mesh or secure / insecure mismatch */ if ((!elems->mesh_id || !elems->mesh_config) || (elems->rsn && sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) || (!elems->rsn && sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE)) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 8f2b492a9fe9..42286aa3623c 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -667,7 +667,7 @@ void mesh_plink_timer(struct timer_list *t) /* * This STA is valid because sta_info_destroy() will * del_timer_sync() this timer after having made sure - * it cannot be readded (by deleting the plink.) + * it cannot be re-added (by deleting the plink.) */ sta = mesh->plink_sta; diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index 8cf3f395f52f..3a66b4cefca7 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -175,7 +175,7 @@ static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata, spin_lock_bh(&ifmsh->sync_offset_lock); if (ifmsh->sync_offset_clockdrift_max > TOFFSET_MINIMUM_ADJUSTMENT) { - /* Since ajusting the tsf here would + /* Since adjusting the tsf here would * require a possibly blocking call * to the driver tsf setter, we punt * the tsf adjustment to the mesh tasklet diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 6bf3b4444a43..706cbc99f718 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1053,7 +1053,7 @@ minstrel_ht_refill_sample_rates(struct minstrel_ht_sta *mi) * - max_prob_rate must use only one stream, as a tradeoff between delivery * probability and throughput during strong fluctuations * - as long as the max prob rate has a probability of more than 75%, pick - * higher throughput rates, even if the probablity is a bit lower + * higher throughput rates, even if the probability is a bit lower */ static void minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9195d5a2de0a..9f89fb5bee37 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -169,7 +169,7 @@ struct sta_info; * @buf_size: reorder buffer size at receiver * @failed_bar_ssn: ssn of the last failed BAR tx attempt * @bar_pending: BAR needs to be re-sent - * @amsdu: support A-MSDU withing A-MDPU + * @amsdu: support A-MSDU within A-MDPU * @ssn: starting sequence number of the session * * This structure's lifetime is managed by RCU, assignments to diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index e7f57bb18f6e..7aac84cec044 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -313,7 +313,7 @@ int ieee80211_tkip_decrypt_data(struct arc4_ctx *ctx, * Record previously received IV, will be copied into the * key information after MIC verification. It is possible * that we don't catch replays of fragments but that's ok - * because the Michael MIC verication will then fail. + * because the Michael MIC verification will then fail. */ *out_iv32 = iv32; *out_iv16 = iv16; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a9ee86982259..eedb2c5123ab 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -6214,7 +6214,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, goto start_xmit; /* update QoS header to prioritize control port frames if possible, - * priorization also happens for control port frames send over + * prioritization also happens for control port frames send over * AF_PACKET */ rcu_read_lock(); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index f0db60878321..bd93de637f94 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1010,7 +1010,7 @@ void ieee80211_set_wmm_default(struct ieee80211_link_data *link, else aCWmin = 15; - /* Confiure old 802.11b/g medium access rules. */ + /* Configure old 802.11b/g medium access rules. */ qparam.cw_max = aCWmax; qparam.cw_min = aCWmin; qparam.txop = 0; diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index bf6ef45af757..eafe47bf201a 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -280,10 +280,10 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, /* * This is a workaround for VHT-enabled STAs which break the spec * and have the VHT-MCS Rx map filled in with value 3 for all eight - * spacial streams, an example is AR9462. + * spatial streams, an example is AR9462. * * As per spec, in section 22.1.1 Introduction to the VHT PHY - * A VHT STA shall support at least single spactial stream VHT-MCSs + * A VHT STA shall support at least single spatial stream VHT-MCSs * 0 to 7 (transmit and receive) in all supported channel widths. */ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) { diff --git a/net/wireless/chan.c b/net/wireless/chan.c index e579d7e1425f..afd86f7c66ce 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -289,7 +289,7 @@ static bool cfg80211_valid_center_freq(u32 center, /* * Valid channels are packed from lowest frequency towards higher ones. - * So test that the lower frequency alignes with one of these steps. + * So test that the lower frequency aligns with one of these steps. */ return (center - bw / 2 - 5945) % step == 0; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9ab777e0bd4d..d51bcb4e9108 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12446,7 +12446,7 @@ static int nl80211_del_pmksa(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) { pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); } else if (info->attrs[NL80211_ATTR_SSID]) { - /* SSID based pmksa flush suppported only for FILS, + /* SSID based pmksa flush supported only for FILS, * OWE/SAE OFFLOAD cases */ if (info->attrs[NL80211_ATTR_FILS_CACHE_ID] && @@ -15498,7 +15498,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) if (tsid >= IEEE80211_FIRST_TSPEC_TSID) { /* TODO: handle 802.11 TSPEC/admission control * need more attributes for that (e.g. BA session requirement); - * change the WMM adminssion test above to allow both then + * change the WMM admission test above to allow both then */ return -EINVAL; } diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index ae2e1a896461..619187e229b4 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -200,7 +200,7 @@ static void find_ns(struct ieee80211_radiotap_iterator *iterator, * present fields. @this_arg can be changed by the caller (eg, * incremented to move inside a compound argument like * IEEE80211_RADIOTAP_CHANNEL). The args pointed to are in - * little-endian format whatever the endianess of your CPU. + * little-endian format whatever the endianness of your CPU. * * Alignment Gotcha: * You must take care when dereferencing iterator.this_arg diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 6489ba943a63..1df65a5a44f7 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1147,7 +1147,7 @@ static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy) /* * Follow the driver's regulatory domain, if present, unless a country - * IE has been processed or a user wants to help complaince further + * IE has been processed or a user wants to help compliance further */ if (lr->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE && lr->initiator != NL80211_REGDOM_SET_BY_USER && diff --git a/net/wireless/util.c b/net/wireless/util.c index f49b55724f83..93a9c32418a6 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -743,7 +743,7 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, return NULL; /* - * When reusing framents, copy some data to the head to simplify + * When reusing fragments, copy some data to the head to simplify * ethernet header handling and speed up protocol header processing * in the stack later. */ diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 2371069f3c43..cd9f8f6e298b 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1204,7 +1204,7 @@ static int cfg80211_wext_siwpower(struct net_device *dev, switch (wrq->flags & IW_POWER_MODE) { case IW_POWER_ON: /* If not specified */ case IW_POWER_MODE: /* If set all mask */ - case IW_POWER_ALL_R: /* If explicitely state all */ + case IW_POWER_ALL_R: /* If explicitly state all */ ps = true; break; default: /* Otherwise we ignore */ diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 838ad6541a17..3bb04b05c5ce 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -1159,7 +1159,7 @@ char *iwe_stream_add_event(struct iw_request_info *info, char *stream, /* Check if it's possible */ if (likely((stream + event_len) < ends)) { iwe->len = event_len; - /* Beware of alignement issues on 64 bits */ + /* Beware of alignment issues on 64 bits */ memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); memcpy(stream + lcp_len, &iwe->u, event_len - lcp_len); -- cgit v1.3 From bd9813d13be439851a7ff3e6372e53caa6e387a6 Mon Sep 17 00:00:00 2001 From: Karthikeyan Periyasamy Date: Tue, 17 Sep 2024 19:32:39 +0530 Subject: wifi: cfg80211: check radio iface combination for multi radio per wiphy Currently, wiphy_verify_combinations() fails for the multi-radio per wiphy due to the condition check on new global interface combination that DFS only works on one channel. In a multi-radio scenario, new global interface combination encompasses the capabilities of all radio combinations, so it supports more than one channel with DFS. For multi-radio per wiphy, interface combination verification needs to be performed for radio specific interface combinations. This is necessary as the new global interface combination combines the capabilities of all radio combinations. Fixes: a01b1e9f9955 ("wifi: mac80211: add support for DFS with multiple radios") Signed-off-by: Karthikeyan Periyasamy Link: https://patch.msgid.link/20240917140239.886083-1-quic_periyasa@quicinc.com Signed-off-by: Johannes Berg --- net/wireless/core.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 661adfc77644..4c8d8f167409 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -603,16 +603,20 @@ use_default_name: } EXPORT_SYMBOL(wiphy_new_nm); -static int wiphy_verify_combinations(struct wiphy *wiphy) +static +int wiphy_verify_iface_combinations(struct wiphy *wiphy, + const struct ieee80211_iface_combination *iface_comb, + int n_iface_comb, + bool combined_radio) { const struct ieee80211_iface_combination *c; int i, j; - for (i = 0; i < wiphy->n_iface_combinations; i++) { + for (i = 0; i < n_iface_comb; i++) { u32 cnt = 0; u16 all_iftypes = 0; - c = &wiphy->iface_combinations[i]; + c = &iface_comb[i]; /* * Combinations with just one interface aren't real, @@ -625,9 +629,13 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) if (WARN_ON(!c->num_different_channels)) return -EINVAL; - /* DFS only works on one channel. */ - if (WARN_ON(c->radar_detect_widths && - (c->num_different_channels > 1))) + /* DFS only works on one channel. Avoid this check + * for multi-radio global combination, since it hold + * the capabilities of all radio combinations. + */ + if (!combined_radio && + WARN_ON(c->radar_detect_widths && + c->num_different_channels > 1)) return -EINVAL; if (WARN_ON(!c->n_limits)) @@ -648,13 +656,21 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) if (WARN_ON(wiphy->software_iftypes & types)) return -EINVAL; - /* Only a single P2P_DEVICE can be allowed */ - if (WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) && + /* Only a single P2P_DEVICE can be allowed, avoid this + * check for multi-radio global combination, since it + * hold the capabilities of all radio combinations. + */ + if (!combined_radio && + WARN_ON(types & BIT(NL80211_IFTYPE_P2P_DEVICE) && c->limits[j].max > 1)) return -EINVAL; - /* Only a single NAN can be allowed */ - if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && + /* Only a single NAN can be allowed, avoid this + * check for multi-radio global combination, since it + * hold the capabilities of all radio combinations. + */ + if (!combined_radio && + WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && c->limits[j].max > 1)) return -EINVAL; @@ -693,6 +709,34 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) return 0; } +static int wiphy_verify_combinations(struct wiphy *wiphy) +{ + int i, ret; + bool combined_radio = false; + + if (wiphy->n_radio) { + for (i = 0; i < wiphy->n_radio; i++) { + const struct wiphy_radio *radio = &wiphy->radio[i]; + + ret = wiphy_verify_iface_combinations(wiphy, + radio->iface_combinations, + radio->n_iface_combinations, + false); + if (ret) + return ret; + } + + combined_radio = true; + } + + ret = wiphy_verify_iface_combinations(wiphy, + wiphy->iface_combinations, + wiphy->n_iface_combinations, + combined_radio); + + return ret; +} + int wiphy_register(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); -- cgit v1.3 From b0b6646a9d680c1d865ef308d84de98e28df9963 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 30 Sep 2024 13:21:13 +0200 Subject: mac80211: Reorganize kerneldoc parameter names Reorganize kerneldoc parameter names to match the parameter order in the function header. Problems identified using Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Jeff Johnson Link: https://patch.msgid.link/20240930112121.95324-28-Julia.Lawall@inria.fr Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 6 +++--- net/mac80211/mesh_pathtbl.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 024f48db6b05..0b13a6648e08 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -220,12 +220,12 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata, /** * mesh_path_error_tx - Sends a PERR mesh management frame * + * @sdata: local mesh subif * @ttl: allowed remaining hops * @target: broken destination * @target_sn: SN of the broken destination * @target_rcode: reason code for this PERR * @ra: node this frame is addressed to - * @sdata: local mesh subif * * Note: This function may be called with driver locks taken that the driver * also acquires in the TX path. To avoid a deadlock we don't transmit the @@ -1137,8 +1137,8 @@ enddiscovery: /** * mesh_nexthop_resolve - lookup next hop; conditionally start path discovery * - * @skb: 802.11 frame to be sent * @sdata: network subif the frame will be sent through + * @skb: 802.11 frame to be sent * * Lookup next hop for given skb and start path discovery if no * forwarding information is found. @@ -1245,8 +1245,8 @@ void mesh_path_refresh(struct ieee80211_sub_if_data *sdata, * this function is considered "using" the associated mpath, so preempt a path * refresh if this mpath expires soon. * - * @skb: 802.11 frame to be sent * @sdata: network subif the frame will be sent through + * @skb: 802.11 frame to be sent * * Returns: 0 if the next hop was found. Nonzero otherwise. */ diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 30c0d89203af..9f9cb5af0a97 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -300,8 +300,8 @@ __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) /** * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index - * @idx: index * @sdata: local subif, or NULL for all entries + * @idx: index * * Returns: pointer to the mesh path structure, or NULL if not found. * @@ -315,8 +315,8 @@ mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) /** * mpp_path_lookup_by_idx - look up a path in the proxy path table by its index - * @idx: index * @sdata: local subif, or NULL for all entries + * @idx: index * * Returns: pointer to the proxy path structure, or NULL if not found. * @@ -670,8 +670,8 @@ void mesh_fast_tx_flush_addr(struct ieee80211_sub_if_data *sdata, /** * mesh_path_add - allocate and add a new path to the mesh path table - * @dst: destination address of the path (ETH_ALEN length) * @sdata: local subif + * @dst: destination address of the path (ETH_ALEN length) * * Returns: 0 on success * @@ -916,8 +916,8 @@ static int table_path_del(struct mesh_table *tbl, /** * mesh_path_del - delete a mesh path from the table * - * @addr: dst address (ETH_ALEN length) * @sdata: local subif + * @addr: dst address (ETH_ALEN length) * * Returns: 0 if successful */ @@ -996,8 +996,8 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) /** * mesh_path_discard_frame - discard a frame whose path could not be resolved * - * @skb: frame to discard * @sdata: network subif the frame was to be sent through + * @skb: frame to discard * * Locking: the function must me called within a rcu_read_lock region */ -- cgit v1.3 From 484bd64bdc2721224e90e131607398e546cf84b1 Mon Sep 17 00:00:00 2001 From: Dmitry Kandybka Date: Thu, 3 Oct 2024 12:59:12 +0300 Subject: wifi: nl80211: remove redundant null pointer check in coalescing In 'cfg80211_free_coalesce', '&coalesce->rules[i]' is a pointer to VLA member of 'struct cfg80211_coalesce' and should never be NULL, so redundant check may be dropped. I think this is correct, but I haven't tested it seriously. Compile tested only. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Kandybka Link: https://patch.msgid.link/20241003095912.218465-1-d.kandybka@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d51bcb4e9108..4e3609176880 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14061,8 +14061,6 @@ void cfg80211_free_coalesce(struct cfg80211_coalesce *coalesce) for (i = 0; i < coalesce->n_rules; i++) { rule = &coalesce->rules[i]; - if (!rule) - continue; for (j = 0; j < rule->n_patterns; j++) kfree(rule->patterns[j].mask); kfree(rule->patterns); -- cgit v1.3 From 02f220b5267042d0de649614eec84ded8aeecb4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 20:26:55 +0200 Subject: wifi: ipw2x00/lib80211: move remaining lib80211 into libipw There's already much code in libipw that used to be shared with more drivers, but now with the prior cleanups, those old Intel ipw2x00 drivers are also the only ones using whatever is now left of lib80211. Move lib80211 entirely into libipw. Link: https://patch.msgid.link/20241007202707.915ef7b9e7c7.Ib9876d2fe3c90f11d6df458b16d0b7d4bf551a8d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/ipw2x00/Kconfig | 6 - drivers/net/wireless/intel/ipw2x00/Makefile | 6 +- drivers/net/wireless/intel/ipw2x00/ipw2100.c | 7 +- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 4 +- drivers/net/wireless/intel/ipw2x00/ipw2200.h | 2 - drivers/net/wireless/intel/ipw2x00/libipw.h | 101 ++- drivers/net/wireless/intel/ipw2x00/libipw_crypto.c | 246 +++++++ .../wireless/intel/ipw2x00/libipw_crypto_ccmp.c | 438 ++++++++++++ .../wireless/intel/ipw2x00/libipw_crypto_tkip.c | 728 ++++++++++++++++++++ .../net/wireless/intel/ipw2x00/libipw_crypto_wep.c | 247 +++++++ drivers/net/wireless/intel/ipw2x00/libipw_module.c | 36 +- drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 9 +- drivers/net/wireless/intel/ipw2x00/libipw_tx.c | 4 +- drivers/net/wireless/intel/ipw2x00/libipw_wx.c | 43 +- include/net/lib80211.h | 122 ---- net/wireless/Kconfig | 33 - net/wireless/Makefile | 4 - net/wireless/lib80211.c | 257 ------- net/wireless/lib80211_crypt_ccmp.c | 448 ------------- net/wireless/lib80211_crypt_tkip.c | 738 --------------------- net/wireless/lib80211_crypt_wep.c | 256 ------- 21 files changed, 1825 insertions(+), 1910 deletions(-) create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto.c create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c delete mode 100644 include/net/lib80211.h delete mode 100644 net/wireless/lib80211.c delete mode 100644 net/wireless/lib80211_crypt_ccmp.c delete mode 100644 net/wireless/lib80211_crypt_tkip.c delete mode 100644 net/wireless/lib80211_crypt_wep.c (limited to 'net') diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index 1650d5865aa0..d9c042772399 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -10,7 +10,6 @@ config IPW2100 select WEXT_SPY select WEXT_PRIV select FW_LOADER - select LIB80211 select LIBIPW help A driver for the Intel PRO/Wireless 2100 Network @@ -72,7 +71,6 @@ config IPW2200 select WEXT_SPY select WEXT_PRIV select FW_LOADER - select LIB80211 select LIBIPW help A driver for the Intel PRO/Wireless 2200BG and 2915ABG Network @@ -162,10 +160,6 @@ config LIBIPW select CRYPTO select CRYPTO_MICHAEL_MIC select CRC32 - select LIB80211 - select LIB80211_CRYPT_WEP - select LIB80211_CRYPT_TKIP - select LIB80211_CRYPT_CCMP help This option enables the hardware independent IEEE 802.11 networking stack. This component is deprecated in favor of the diff --git a/drivers/net/wireless/intel/ipw2x00/Makefile b/drivers/net/wireless/intel/ipw2x00/Makefile index e1ec50359dff..60c5faccbe15 100644 --- a/drivers/net/wireless/intel/ipw2x00/Makefile +++ b/drivers/net/wireless/intel/ipw2x00/Makefile @@ -12,4 +12,8 @@ libipw-objs := \ libipw_tx.o \ libipw_rx.o \ libipw_wx.o \ - libipw_geo.o + libipw_geo.o \ + libipw_crypto.o \ + libipw_crypto_ccmp.o \ + libipw_crypto_tkip.o \ + libipw_crypto_wep.o diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c index b6636002c7d2..a89e06c1b8ee 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c @@ -148,9 +148,6 @@ that only one external action is invoked at a time. #include #include #include - -#include - #include "ipw2100.h" #include "ipw.h" @@ -7571,7 +7568,7 @@ static int ipw2100_wx_set_auth(struct net_device *dev, struct ipw2100_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; struct iw_param *param = &wrqu->param; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; unsigned long flags; int ret = 0; @@ -7663,7 +7660,7 @@ static int ipw2100_wx_get_auth(struct net_device *dev, { struct ipw2100_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; struct iw_param *param = &wrqu->param; switch (param->flags & IW_AUTH_INDEX) { diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index eed9ef17bc29..f4fd1fc784b7 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -6549,7 +6549,7 @@ static int ipw_wx_set_auth(struct net_device *dev, struct ipw_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; struct iw_param *param = &wrqu->param; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; unsigned long flags; int ret = 0; @@ -6648,7 +6648,7 @@ static int ipw_wx_get_auth(struct net_device *dev, { struct ipw_priv *priv = libipw_priv(dev); struct libipw_device *ieee = priv->ieee; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; struct iw_param *param = &wrqu->param; switch (param->flags & IW_AUTH_INDEX) { diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.h b/drivers/net/wireless/intel/ipw2x00/ipw2200.h index 8ebf09121e17..46f119123b49 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.h +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.h @@ -31,8 +31,6 @@ #include #include #include - -#include #include #define DRV_NAME "ipw2200" diff --git a/drivers/net/wireless/intel/ipw2x00/libipw.h b/drivers/net/wireless/intel/ipw2x00/libipw.h index bad080d33c07..bc727c99ff3c 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw.h +++ b/drivers/net/wireless/intel/ipw2x00/libipw.h @@ -25,8 +25,6 @@ #include /* ARRAY_SIZE */ #include #include - -#include #include #define LIBIPW_VERSION "git-1.1.13" @@ -699,6 +697,76 @@ struct libipw_geo { struct libipw_channel a[LIBIPW_52GHZ_CHANNELS]; }; +#define NUM_WEP_KEYS 4 + +enum { + IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), +}; + +struct module; + +struct libipw_crypto_ops { + const char *name; + struct list_head list; + + /* init new crypto context (e.g., allocate private data space, + * select IV, etc.); returns NULL on failure or pointer to allocated + * private data on success */ + void *(*init) (int keyidx); + + /* deinitialize crypto context and free allocated private data */ + void (*deinit) (void *priv); + + /* encrypt/decrypt return < 0 on error or >= 0 on success. The return + * value from decrypt_mpdu is passed as the keyidx value for + * decrypt_msdu. skb must have enough head and tail room for the + * encryption; if not, error will be returned; these functions are + * called for all MPDUs (i.e., fragments). + */ + int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); + int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); + + /* These functions are called for full MSDUs, i.e. full frames. + * These can be NULL if full MSDU operations are not needed. */ + int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); + int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, + void *priv); + + int (*set_key) (void *key, int len, u8 * seq, void *priv); + int (*get_key) (void *key, int len, u8 * seq, void *priv); + + /* procfs handler for printing out key information and possible + * statistics */ + void (*print_stats) (struct seq_file *m, void *priv); + + /* Crypto specific flag get/set for configuration settings */ + unsigned long (*get_flags) (void *priv); + unsigned long (*set_flags) (unsigned long flags, void *priv); + + /* maximum number of bytes added by encryption; encrypt buf is + * allocated with extra_prefix_len bytes, copy of in_buf, and + * extra_postfix_len; encrypt need not use all this space, but + * the result must start at the beginning of the buffer and correct + * length must be returned */ + int extra_mpdu_prefix_len, extra_mpdu_postfix_len; + int extra_msdu_prefix_len, extra_msdu_postfix_len; + + struct module *owner; +}; + +struct libipw_crypt_info { + char *name; + /* Most clients will already have a lock, + so just point to that. */ + spinlock_t *lock; + + struct libipw_crypt_data *crypt[NUM_WEP_KEYS]; + int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ + struct list_head crypt_deinit_list; + struct timer_list crypt_deinit_timer; + int crypt_quiesced; +}; + struct libipw_device { struct net_device *dev; struct wireless_dev wdev; @@ -751,7 +819,7 @@ struct libipw_device { size_t wpa_ie_len; u8 *wpa_ie; - struct lib80211_crypt_info crypt_info; + struct libipw_crypt_info crypt_info; int bcrx_sta_key; /* use individual keys to override default keys even * with RX of broad/multicast frames */ @@ -988,4 +1056,31 @@ static inline int libipw_get_scans(struct libipw_device *ieee) return ieee->scans; } +struct libipw_crypt_data { + struct list_head list; /* delayed deletion list */ + const struct libipw_crypto_ops *ops; + void *priv; + atomic_t refcnt; +}; + +int libipw_crypt_info_init(struct libipw_crypt_info *info, char *name, + spinlock_t *lock); +void libipw_crypt_info_free(struct libipw_crypt_info *info); +int libipw_register_crypto_ops(const struct libipw_crypto_ops *ops); +int libipw_unregister_crypto_ops(const struct libipw_crypto_ops *ops); +const struct libipw_crypto_ops *libipw_get_crypto_ops(const char *name); +void libipw_crypt_delayed_deinit(struct libipw_crypt_info *info, + struct libipw_crypt_data **crypt); + +/* must be called in the listed order */ +int libipw_crypto_init(void); +int libipw_crypto_ccmp_init(void); +int libipw_crypto_tkip_init(void); +int libipw_crypto_wep_init(void); + +void libipw_crypto_wep_exit(void); +void libipw_crypto_tkip_exit(void); +void libipw_crypto_ccmp_exit(void); +void libipw_crypto_exit(void); + #endif /* LIBIPW_H */ diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto.c new file mode 100644 index 000000000000..32639e0e8430 --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw -- common bits for IPW drivers + * + * Copyright(c) 2008 John W. Linville + * + * Portions copied from old ieee80211 component, w/ original copyright + * notices below: + * + * Host AP crypto routines + * + * Copyright (c) 2002-2003, Jouni Malinen + * Portions Copyright (C) 2004, Intel Corporation + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +struct libipw_crypto_alg { + struct list_head list; + const struct libipw_crypto_ops *ops; +}; + +static LIST_HEAD(libipw_crypto_algs); +static DEFINE_SPINLOCK(libipw_crypto_lock); + +static void libipw_crypt_deinit_entries(struct libipw_crypt_info *info, + int force); +static void libipw_crypt_quiescing(struct libipw_crypt_info *info); +static void libipw_crypt_deinit_handler(struct timer_list *t); + +int libipw_crypt_info_init(struct libipw_crypt_info *info, char *name, + spinlock_t *lock) +{ + memset(info, 0, sizeof(*info)); + + info->name = name; + info->lock = lock; + + INIT_LIST_HEAD(&info->crypt_deinit_list); + timer_setup(&info->crypt_deinit_timer, libipw_crypt_deinit_handler, + 0); + + return 0; +} +EXPORT_SYMBOL(libipw_crypt_info_init); + +void libipw_crypt_info_free(struct libipw_crypt_info *info) +{ + int i; + + libipw_crypt_quiescing(info); + del_timer_sync(&info->crypt_deinit_timer); + libipw_crypt_deinit_entries(info, 1); + + for (i = 0; i < NUM_WEP_KEYS; i++) { + struct libipw_crypt_data *crypt = info->crypt[i]; + if (crypt) { + if (crypt->ops) { + crypt->ops->deinit(crypt->priv); + module_put(crypt->ops->owner); + } + kfree(crypt); + info->crypt[i] = NULL; + } + } +} +EXPORT_SYMBOL(libipw_crypt_info_free); + +static void libipw_crypt_deinit_entries(struct libipw_crypt_info *info, + int force) +{ + struct libipw_crypt_data *entry, *next; + unsigned long flags; + + spin_lock_irqsave(info->lock, flags); + list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) { + if (atomic_read(&entry->refcnt) != 0 && !force) + continue; + + list_del(&entry->list); + + if (entry->ops) { + entry->ops->deinit(entry->priv); + module_put(entry->ops->owner); + } + kfree(entry); + } + spin_unlock_irqrestore(info->lock, flags); +} + +/* After this, crypt_deinit_list won't accept new members */ +static void libipw_crypt_quiescing(struct libipw_crypt_info *info) +{ + unsigned long flags; + + spin_lock_irqsave(info->lock, flags); + info->crypt_quiesced = 1; + spin_unlock_irqrestore(info->lock, flags); +} + +static void libipw_crypt_deinit_handler(struct timer_list *t) +{ + struct libipw_crypt_info *info = from_timer(info, t, + crypt_deinit_timer); + unsigned long flags; + + libipw_crypt_deinit_entries(info, 0); + + spin_lock_irqsave(info->lock, flags); + if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) { + printk(KERN_DEBUG "%s: entries remaining in delayed crypt " + "deletion list\n", info->name); + info->crypt_deinit_timer.expires = jiffies + HZ; + add_timer(&info->crypt_deinit_timer); + } + spin_unlock_irqrestore(info->lock, flags); +} + +void libipw_crypt_delayed_deinit(struct libipw_crypt_info *info, + struct libipw_crypt_data **crypt) +{ + struct libipw_crypt_data *tmp; + unsigned long flags; + + if (*crypt == NULL) + return; + + tmp = *crypt; + *crypt = NULL; + + /* must not run ops->deinit() while there may be pending encrypt or + * decrypt operations. Use a list of delayed deinits to avoid needing + * locking. */ + + spin_lock_irqsave(info->lock, flags); + if (!info->crypt_quiesced) { + list_add(&tmp->list, &info->crypt_deinit_list); + if (!timer_pending(&info->crypt_deinit_timer)) { + info->crypt_deinit_timer.expires = jiffies + HZ; + add_timer(&info->crypt_deinit_timer); + } + } + spin_unlock_irqrestore(info->lock, flags); +} +EXPORT_SYMBOL(libipw_crypt_delayed_deinit); + +int libipw_register_crypto_ops(const struct libipw_crypto_ops *ops) +{ + unsigned long flags; + struct libipw_crypto_alg *alg; + + alg = kzalloc(sizeof(*alg), GFP_KERNEL); + if (alg == NULL) + return -ENOMEM; + + alg->ops = ops; + + spin_lock_irqsave(&libipw_crypto_lock, flags); + list_add(&alg->list, &libipw_crypto_algs); + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + + printk(KERN_DEBUG "libipw_crypt: registered algorithm '%s'\n", + ops->name); + + return 0; +} +EXPORT_SYMBOL(libipw_register_crypto_ops); + +int libipw_unregister_crypto_ops(const struct libipw_crypto_ops *ops) +{ + struct libipw_crypto_alg *alg; + unsigned long flags; + + spin_lock_irqsave(&libipw_crypto_lock, flags); + list_for_each_entry(alg, &libipw_crypto_algs, list) { + if (alg->ops == ops) + goto found; + } + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + return -EINVAL; + + found: + printk(KERN_DEBUG "libipw_crypt: unregistered algorithm '%s'\n", + ops->name); + list_del(&alg->list); + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + kfree(alg); + return 0; +} +EXPORT_SYMBOL(libipw_unregister_crypto_ops); + +const struct libipw_crypto_ops *libipw_get_crypto_ops(const char *name) +{ + struct libipw_crypto_alg *alg; + unsigned long flags; + + spin_lock_irqsave(&libipw_crypto_lock, flags); + list_for_each_entry(alg, &libipw_crypto_algs, list) { + if (strcmp(alg->ops->name, name) == 0) + goto found; + } + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + return NULL; + + found: + spin_unlock_irqrestore(&libipw_crypto_lock, flags); + return alg->ops; +} +EXPORT_SYMBOL(libipw_get_crypto_ops); + +static void *libipw_crypt_null_init(int keyidx) +{ + return (void *)1; +} + +static void libipw_crypt_null_deinit(void *priv) +{ +} + +static const struct libipw_crypto_ops libipw_crypt_null = { + .name = "NULL", + .init = libipw_crypt_null_init, + .deinit = libipw_crypt_null_deinit, + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_null); +} + +void libipw_crypto_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_null); + BUG_ON(!list_empty(&libipw_crypto_algs)); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c new file mode 100644 index 000000000000..bf900d8c8ad3 --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_ccmp.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw crypt: host-based CCMP encryption implementation for libipw + * + * Copyright (c) 2003-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +#define AES_BLOCK_LEN 16 +#define CCMP_HDR_LEN 8 +#define CCMP_MIC_LEN 8 +#define CCMP_TK_LEN 16 +#define CCMP_PN_LEN 6 + +struct libipw_ccmp_data { + u8 key[CCMP_TK_LEN]; + int key_set; + + u8 tx_pn[CCMP_PN_LEN]; + u8 rx_pn[CCMP_PN_LEN]; + + u32 dot11RSNAStatsCCMPFormatErrors; + u32 dot11RSNAStatsCCMPReplays; + u32 dot11RSNAStatsCCMPDecryptErrors; + + int key_idx; + + struct crypto_aead *tfm; + + /* scratch buffers for virt_to_page() (crypto API) */ + u8 tx_aad[2 * AES_BLOCK_LEN]; + u8 rx_aad[2 * AES_BLOCK_LEN]; +}; + +static void *libipw_ccmp_init(int key_idx) +{ + struct libipw_ccmp_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + priv->key_idx = key_idx; + + priv->tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(priv->tfm)) { + priv->tfm = NULL; + goto fail; + } + + return priv; + + fail: + if (priv) { + if (priv->tfm) + crypto_free_aead(priv->tfm); + kfree(priv); + } + + return NULL; +} + +static void libipw_ccmp_deinit(void *priv) +{ + struct libipw_ccmp_data *_priv = priv; + if (_priv && _priv->tfm) + crypto_free_aead(_priv->tfm); + kfree(priv); +} + +static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, + const u8 *pn, u8 *iv, u8 *aad) +{ + u8 *pos, qc = 0; + size_t aad_len; + int a4_included, qc_included; + + a4_included = ieee80211_has_a4(hdr->frame_control); + qc_included = ieee80211_is_data_qos(hdr->frame_control); + + aad_len = 22; + if (a4_included) + aad_len += 6; + if (qc_included) { + pos = (u8 *) & hdr->addr4; + if (a4_included) + pos += 6; + qc = *pos & 0x0f; + aad_len += 2; + } + + /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC + * mode authentication are not allowed to collide, yet both are derived + * from the same vector. We only set L := 1 here to indicate that the + * data size can be represented in (L+1) bytes. The CCM layer will take + * care of storing the data length in the top (L+1) bytes and setting + * and clearing the other bits as is required to derive the two IVs. + */ + iv[0] = 0x1; + + /* Nonce: QC | A2 | PN */ + iv[1] = qc; + memcpy(iv + 2, hdr->addr2, ETH_ALEN); + memcpy(iv + 8, pn, CCMP_PN_LEN); + + /* AAD: + * FC with bits 4..6 and 11..13 masked to zero; 14 is always one + * A1 | A2 | A3 + * SC with bits 4..15 (seq#) masked to zero + * A4 (if present) + * QC (if present) + */ + pos = (u8 *) hdr; + aad[0] = pos[0] & 0x8f; + aad[1] = pos[1] & 0xc7; + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); + pos = (u8 *) & hdr->seq_ctrl; + aad[20] = pos[0] & 0x0f; + aad[21] = 0; /* all bits masked */ + memset(aad + 22, 0, 8); + if (a4_included) + memcpy(aad + 22, hdr->addr4, ETH_ALEN); + if (qc_included) { + aad[a4_included ? 28 : 22] = qc; + /* rest of QC masked */ + } + return aad_len; +} + +static int libipw_ccmp_hdr(struct sk_buff *skb, int hdr_len, + u8 *aeskey, int keylen, void *priv) +{ + struct libipw_ccmp_data *key = priv; + int i; + u8 *pos; + + if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len) + return -1; + + if (aeskey != NULL && keylen >= CCMP_TK_LEN) + memcpy(aeskey, key->key, CCMP_TK_LEN); + + pos = skb_push(skb, CCMP_HDR_LEN); + memmove(pos, pos + CCMP_HDR_LEN, hdr_len); + pos += hdr_len; + + i = CCMP_PN_LEN - 1; + while (i >= 0) { + key->tx_pn[i]++; + if (key->tx_pn[i] != 0) + break; + i--; + } + + *pos++ = key->tx_pn[5]; + *pos++ = key->tx_pn[4]; + *pos++ = 0; + *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ; + *pos++ = key->tx_pn[3]; + *pos++ = key->tx_pn[2]; + *pos++ = key->tx_pn[1]; + *pos++ = key->tx_pn[0]; + + return CCMP_HDR_LEN; +} + +static int libipw_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_ccmp_data *key = priv; + struct ieee80211_hdr *hdr; + struct aead_request *req; + struct scatterlist sg[2]; + u8 *aad = key->tx_aad; + u8 iv[AES_BLOCK_LEN]; + int len, data_len, aad_len; + int ret; + + if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) + return -1; + + data_len = skb->len - hdr_len; + len = libipw_ccmp_hdr(skb, hdr_len, NULL, 0, priv); + if (len < 0) + return -1; + + req = aead_request_alloc(key->tfm, GFP_ATOMIC); + if (!req) + return -ENOMEM; + + hdr = (struct ieee80211_hdr *)skb->data; + aad_len = ccmp_init_iv_and_aad(hdr, key->tx_pn, iv, aad); + + skb_put(skb, CCMP_MIC_LEN); + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], aad, aad_len); + sg_set_buf(&sg[1], skb->data + hdr_len + CCMP_HDR_LEN, + data_len + CCMP_MIC_LEN); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_ad(req, aad_len); + aead_request_set_crypt(req, sg, sg, data_len, iv); + + ret = crypto_aead_encrypt(req); + aead_request_free(req); + + return ret; +} + +/* + * deal with seq counter wrapping correctly. + * refer to timer_after() for jiffies wrapping handling + */ +static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o) +{ + u32 iv32_n, iv16_n; + u32 iv32_o, iv16_o; + + iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3]; + iv16_n = (pn_n[4] << 8) | pn_n[5]; + + iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3]; + iv16_o = (pn_o[4] << 8) | pn_o[5]; + + if ((s32)iv32_n - (s32)iv32_o < 0 || + (iv32_n == iv32_o && iv16_n <= iv16_o)) + return 1; + return 0; +} + +static int libipw_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_ccmp_data *key = priv; + u8 keyidx, *pos; + struct ieee80211_hdr *hdr; + struct aead_request *req; + struct scatterlist sg[2]; + u8 *aad = key->rx_aad; + u8 iv[AES_BLOCK_LEN]; + u8 pn[6]; + int aad_len, ret; + size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN; + + if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) { + key->dot11RSNAStatsCCMPFormatErrors++; + return -1; + } + + hdr = (struct ieee80211_hdr *)skb->data; + pos = skb->data + hdr_len; + keyidx = pos[3]; + if (!(keyidx & (1 << 5))) { + net_dbg_ratelimited("CCMP: received packet without ExtIV flag from %pM\n", + hdr->addr2); + key->dot11RSNAStatsCCMPFormatErrors++; + return -2; + } + keyidx >>= 6; + if (key->key_idx != keyidx) { + net_dbg_ratelimited("CCMP: RX tkey->key_idx=%d frame keyidx=%d\n", + key->key_idx, keyidx); + return -6; + } + if (!key->key_set) { + net_dbg_ratelimited("CCMP: received packet from %pM with keyid=%d that does not have a configured key\n", + hdr->addr2, keyidx); + return -3; + } + + pn[0] = pos[7]; + pn[1] = pos[6]; + pn[2] = pos[5]; + pn[3] = pos[4]; + pn[4] = pos[1]; + pn[5] = pos[0]; + pos += 8; + + if (ccmp_replay_check(pn, key->rx_pn)) { +#ifdef CONFIG_LIBIPW_DEBUG + net_dbg_ratelimited("CCMP: replay detected: STA=%pM previous PN %02x%02x%02x%02x%02x%02x received PN %02x%02x%02x%02x%02x%02x\n", + hdr->addr2, + key->rx_pn[0], key->rx_pn[1], key->rx_pn[2], + key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], + pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); +#endif + key->dot11RSNAStatsCCMPReplays++; + return -4; + } + + req = aead_request_alloc(key->tfm, GFP_ATOMIC); + if (!req) + return -ENOMEM; + + aad_len = ccmp_init_iv_and_aad(hdr, pn, iv, aad); + + sg_init_table(sg, 2); + sg_set_buf(&sg[0], aad, aad_len); + sg_set_buf(&sg[1], pos, data_len); + + aead_request_set_callback(req, 0, NULL, NULL); + aead_request_set_ad(req, aad_len); + aead_request_set_crypt(req, sg, sg, data_len, iv); + + ret = crypto_aead_decrypt(req); + aead_request_free(req); + + if (ret) { + net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM (%d)\n", + hdr->addr2, ret); + key->dot11RSNAStatsCCMPDecryptErrors++; + return -5; + } + + memcpy(key->rx_pn, pn, CCMP_PN_LEN); + + /* Remove hdr and MIC */ + memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len); + skb_pull(skb, CCMP_HDR_LEN); + skb_trim(skb, skb->len - CCMP_MIC_LEN); + + return keyidx; +} + +static int libipw_ccmp_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_ccmp_data *data = priv; + int keyidx; + struct crypto_aead *tfm = data->tfm; + + keyidx = data->key_idx; + memset(data, 0, sizeof(*data)); + data->key_idx = keyidx; + data->tfm = tfm; + if (len == CCMP_TK_LEN) { + memcpy(data->key, key, CCMP_TK_LEN); + data->key_set = 1; + if (seq) { + data->rx_pn[0] = seq[5]; + data->rx_pn[1] = seq[4]; + data->rx_pn[2] = seq[3]; + data->rx_pn[3] = seq[2]; + data->rx_pn[4] = seq[1]; + data->rx_pn[5] = seq[0]; + } + if (crypto_aead_setauthsize(data->tfm, CCMP_MIC_LEN) || + crypto_aead_setkey(data->tfm, data->key, CCMP_TK_LEN)) + return -1; + } else if (len == 0) + data->key_set = 0; + else + return -1; + + return 0; +} + +static int libipw_ccmp_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_ccmp_data *data = priv; + + if (len < CCMP_TK_LEN) + return -1; + + if (!data->key_set) + return 0; + memcpy(key, data->key, CCMP_TK_LEN); + + if (seq) { + seq[0] = data->tx_pn[5]; + seq[1] = data->tx_pn[4]; + seq[2] = data->tx_pn[3]; + seq[3] = data->tx_pn[2]; + seq[4] = data->tx_pn[1]; + seq[5] = data->tx_pn[0]; + } + + return CCMP_TK_LEN; +} + +static void libipw_ccmp_print_stats(struct seq_file *m, void *priv) +{ + struct libipw_ccmp_data *ccmp = priv; + + seq_printf(m, + "key[%d] alg=CCMP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "format_errors=%d replays=%d decrypt_errors=%d\n", + ccmp->key_idx, ccmp->key_set, + ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], + ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], + ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], + ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], + ccmp->dot11RSNAStatsCCMPFormatErrors, + ccmp->dot11RSNAStatsCCMPReplays, + ccmp->dot11RSNAStatsCCMPDecryptErrors); +} + +static const struct libipw_crypto_ops libipw_crypt_ccmp = { + .name = "CCMP", + .init = libipw_ccmp_init, + .deinit = libipw_ccmp_deinit, + .encrypt_mpdu = libipw_ccmp_encrypt, + .decrypt_mpdu = libipw_ccmp_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = libipw_ccmp_set_key, + .get_key = libipw_ccmp_get_key, + .print_stats = libipw_ccmp_print_stats, + .extra_mpdu_prefix_len = CCMP_HDR_LEN, + .extra_mpdu_postfix_len = CCMP_MIC_LEN, + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_ccmp_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_ccmp); +} + +void libipw_crypto_ccmp_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_ccmp); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c new file mode 100644 index 000000000000..32288697da4f --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_tkip.c @@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw crypt: host-based TKIP encryption implementation for libipw + * + * Copyright (c) 2003-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +#define TKIP_HDR_LEN 8 + +struct libipw_tkip_data { +#define TKIP_KEY_LEN 32 + u8 key[TKIP_KEY_LEN]; + int key_set; + + u32 tx_iv32; + u16 tx_iv16; + u16 tx_ttak[5]; + int tx_phase1_done; + + u32 rx_iv32; + u16 rx_iv16; + u16 rx_ttak[5]; + int rx_phase1_done; + u32 rx_iv32_new; + u16 rx_iv16_new; + + u32 dot11RSNAStatsTKIPReplays; + u32 dot11RSNAStatsTKIPICVErrors; + u32 dot11RSNAStatsTKIPLocalMICFailures; + + int key_idx; + + struct arc4_ctx rx_ctx_arc4; + struct arc4_ctx tx_ctx_arc4; + struct crypto_shash *rx_tfm_michael; + struct crypto_shash *tx_tfm_michael; + + /* scratch buffers for virt_to_page() (crypto API) */ + u8 rx_hdr[16], tx_hdr[16]; + + unsigned long flags; +}; + +static unsigned long libipw_tkip_set_flags(unsigned long flags, void *priv) +{ + struct libipw_tkip_data *_priv = priv; + unsigned long old_flags = _priv->flags; + _priv->flags = flags; + return old_flags; +} + +static unsigned long libipw_tkip_get_flags(void *priv) +{ + struct libipw_tkip_data *_priv = priv; + return _priv->flags; +} + +static void *libipw_tkip_init(int key_idx) +{ + struct libipw_tkip_data *priv; + + if (fips_enabled) + return NULL; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + goto fail; + + priv->key_idx = key_idx; + + priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); + if (IS_ERR(priv->tx_tfm_michael)) { + priv->tx_tfm_michael = NULL; + goto fail; + } + + priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); + if (IS_ERR(priv->rx_tfm_michael)) { + priv->rx_tfm_michael = NULL; + goto fail; + } + + return priv; + + fail: + if (priv) { + crypto_free_shash(priv->tx_tfm_michael); + crypto_free_shash(priv->rx_tfm_michael); + kfree(priv); + } + + return NULL; +} + +static void libipw_tkip_deinit(void *priv) +{ + struct libipw_tkip_data *_priv = priv; + if (_priv) { + crypto_free_shash(_priv->tx_tfm_michael); + crypto_free_shash(_priv->rx_tfm_michael); + } + kfree_sensitive(priv); +} + +static inline u16 RotR1(u16 val) +{ + return (val >> 1) | (val << 15); +} + +static inline u8 Lo8(u16 val) +{ + return val & 0xff; +} + +static inline u8 Hi8(u16 val) +{ + return val >> 8; +} + +static inline u16 Lo16(u32 val) +{ + return val & 0xffff; +} + +static inline u16 Hi16(u32 val) +{ + return val >> 16; +} + +static inline u16 Mk16(u8 hi, u8 lo) +{ + return lo | (((u16) hi) << 8); +} + +static inline u16 Mk16_le(__le16 * v) +{ + return le16_to_cpu(*v); +} + +static const u16 Sbox[256] = { + 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154, + 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A, + 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B, + 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B, + 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F, + 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F, + 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5, + 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F, + 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB, + 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397, + 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED, + 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A, + 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194, + 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3, + 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104, + 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D, + 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39, + 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695, + 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83, + 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76, + 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4, + 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B, + 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0, + 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018, + 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751, + 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85, + 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12, + 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9, + 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7, + 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A, + 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8, + 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A, +}; + +static inline u16 _S_(u16 v) +{ + u16 t = Sbox[Hi8(v)]; + return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8)); +} + +#define PHASE1_LOOP_COUNT 8 + +static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA, + u32 IV32) +{ + int i, j; + + /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */ + TTAK[0] = Lo16(IV32); + TTAK[1] = Hi16(IV32); + TTAK[2] = Mk16(TA[1], TA[0]); + TTAK[3] = Mk16(TA[3], TA[2]); + TTAK[4] = Mk16(TA[5], TA[4]); + + for (i = 0; i < PHASE1_LOOP_COUNT; i++) { + j = 2 * (i & 1); + TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j])); + TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j])); + TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j])); + TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j])); + TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i; + } +} + +static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, + u16 IV16) +{ + /* Make temporary area overlap WEP seed so that the final copy can be + * avoided on little endian hosts. */ + u16 *PPK = (u16 *) & WEPSeed[4]; + + /* Step 1 - make copy of TTAK and bring in TSC */ + PPK[0] = TTAK[0]; + PPK[1] = TTAK[1]; + PPK[2] = TTAK[2]; + PPK[3] = TTAK[3]; + PPK[4] = TTAK[4]; + PPK[5] = TTAK[4] + IV16; + + /* Step 2 - 96-bit bijective mixing using S-box */ + PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0])); + PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2])); + PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4])); + PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6])); + PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8])); + PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10])); + + PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12])); + PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14])); + PPK[2] += RotR1(PPK[1]); + PPK[3] += RotR1(PPK[2]); + PPK[4] += RotR1(PPK[3]); + PPK[5] += RotR1(PPK[4]); + + /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value + * WEPSeed[0..2] is transmitted as WEP IV */ + WEPSeed[0] = Hi8(IV16); + WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F; + WEPSeed[2] = Lo8(IV16); + WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1); + +#ifdef __BIG_ENDIAN + { + int i; + for (i = 0; i < 6; i++) + PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8); + } +#endif +} + +static int libipw_tkip_hdr(struct sk_buff *skb, int hdr_len, + u8 * rc4key, int keylen, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 *pos; + struct ieee80211_hdr *hdr; + + hdr = (struct ieee80211_hdr *)skb->data; + + if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len) + return -1; + + if (rc4key == NULL || keylen < 16) + return -1; + + if (!tkey->tx_phase1_done) { + tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2, + tkey->tx_iv32); + tkey->tx_phase1_done = 1; + } + tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16); + + pos = skb_push(skb, TKIP_HDR_LEN); + memmove(pos, pos + TKIP_HDR_LEN, hdr_len); + pos += hdr_len; + + *pos++ = *rc4key; + *pos++ = *(rc4key + 1); + *pos++ = *(rc4key + 2); + *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ; + *pos++ = tkey->tx_iv32 & 0xff; + *pos++ = (tkey->tx_iv32 >> 8) & 0xff; + *pos++ = (tkey->tx_iv32 >> 16) & 0xff; + *pos++ = (tkey->tx_iv32 >> 24) & 0xff; + + tkey->tx_iv16++; + if (tkey->tx_iv16 == 0) { + tkey->tx_phase1_done = 0; + tkey->tx_iv32++; + } + + return TKIP_HDR_LEN; +} + +static int libipw_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + int len; + u8 rc4key[16], *pos, *icv; + u32 crc; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + net_dbg_ratelimited("TKIP countermeasures: dropped TX packet to %pM\n", + hdr->addr1); + return -1; + } + + if (skb_tailroom(skb) < 4 || skb->len < hdr_len) + return -1; + + len = skb->len - hdr_len; + pos = skb->data + hdr_len; + + if ((libipw_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0) + return -1; + + crc = ~crc32_le(~0, pos, len); + icv = skb_put(skb, 4); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + + arc4_setkey(&tkey->tx_ctx_arc4, rc4key, 16); + arc4_crypt(&tkey->tx_ctx_arc4, pos, pos, len + 4); + + return 0; +} + +/* + * deal with seq counter wrapping correctly. + * refer to timer_after() for jiffies wrapping handling + */ +static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, + u32 iv32_o, u16 iv16_o) +{ + if ((s32)iv32_n - (s32)iv32_o < 0 || + (iv32_n == iv32_o && iv16_n <= iv16_o)) + return 1; + return 0; +} + +static int libipw_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 rc4key[16]; + u8 keyidx, *pos; + u32 iv32; + u16 iv16; + struct ieee80211_hdr *hdr; + u8 icv[4]; + u32 crc; + int plen; + + hdr = (struct ieee80211_hdr *)skb->data; + + if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { + net_dbg_ratelimited("TKIP countermeasures: dropped received packet from %pM\n", + hdr->addr2); + return -1; + } + + if (skb->len < hdr_len + TKIP_HDR_LEN + 4) + return -1; + + pos = skb->data + hdr_len; + keyidx = pos[3]; + if (!(keyidx & (1 << 5))) { + net_dbg_ratelimited("TKIP: received packet without ExtIV flag from %pM\n", + hdr->addr2); + return -2; + } + keyidx >>= 6; + if (tkey->key_idx != keyidx) { + net_dbg_ratelimited("TKIP: RX tkey->key_idx=%d frame keyidx=%d\n", + tkey->key_idx, keyidx); + return -6; + } + if (!tkey->key_set) { + net_dbg_ratelimited("TKIP: received packet from %pM with keyid=%d that does not have a configured key\n", + hdr->addr2, keyidx); + return -3; + } + iv16 = (pos[0] << 8) | pos[2]; + iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24); + pos += TKIP_HDR_LEN; + + if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { +#ifdef CONFIG_LIBIPW_DEBUG + net_dbg_ratelimited("TKIP: replay detected: STA=%pM previous TSC %08x%04x received TSC %08x%04x\n", + hdr->addr2, tkey->rx_iv32, tkey->rx_iv16, + iv32, iv16); +#endif + tkey->dot11RSNAStatsTKIPReplays++; + return -4; + } + + if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) { + tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32); + tkey->rx_phase1_done = 1; + } + tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16); + + plen = skb->len - hdr_len - 12; + + arc4_setkey(&tkey->rx_ctx_arc4, rc4key, 16); + arc4_crypt(&tkey->rx_ctx_arc4, pos, pos, plen + 4); + + crc = ~crc32_le(~0, pos, plen); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + if (memcmp(icv, pos + plen, 4) != 0) { + if (iv32 != tkey->rx_iv32) { + /* Previously cached Phase1 result was already lost, so + * it needs to be recalculated for the next packet. */ + tkey->rx_phase1_done = 0; + } +#ifdef CONFIG_LIBIPW_DEBUG + net_dbg_ratelimited("TKIP: ICV error detected: STA=%pM\n", + hdr->addr2); +#endif + tkey->dot11RSNAStatsTKIPICVErrors++; + return -5; + } + + /* Update real counters only after Michael MIC verification has + * completed */ + tkey->rx_iv32_new = iv32; + tkey->rx_iv16_new = iv16; + + /* Remove IV and ICV */ + memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len); + skb_pull(skb, TKIP_HDR_LEN); + skb_trim(skb, skb->len - 4); + + return keyidx; +} + +static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, + u8 *data, size_t data_len, u8 *mic) +{ + SHASH_DESC_ON_STACK(desc, tfm_michael); + int err; + + if (tfm_michael == NULL) { + pr_warn("%s(): tfm_michael == NULL\n", __func__); + return -1; + } + + desc->tfm = tfm_michael; + + if (crypto_shash_setkey(tfm_michael, key, 8)) + return -1; + + err = crypto_shash_init(desc); + if (err) + goto out; + err = crypto_shash_update(desc, hdr, 16); + if (err) + goto out; + err = crypto_shash_update(desc, data, data_len); + if (err) + goto out; + err = crypto_shash_final(desc, mic); + +out: + shash_desc_zero(desc); + return err; +} + +static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) +{ + struct ieee80211_hdr *hdr11; + + hdr11 = (struct ieee80211_hdr *)skb->data; + + switch (le16_to_cpu(hdr11->frame_control) & + (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) { + case IEEE80211_FCTL_TODS: + memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ + break; + case IEEE80211_FCTL_FROMDS: + memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */ + break; + case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS: + memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ + break; + default: + memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ + memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ + break; + } + + if (ieee80211_is_data_qos(hdr11->frame_control)) { + hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11))) + & IEEE80211_QOS_CTL_TID_MASK; + } else + hdr[12] = 0; /* priority */ + + hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */ +} + +static int libipw_michael_mic_add(struct sk_buff *skb, int hdr_len, + void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 *pos; + + if (skb_tailroom(skb) < 8 || skb->len < hdr_len) { + printk(KERN_DEBUG "Invalid packet for Michael MIC add " + "(tailroom=%d hdr_len=%d skb->len=%d)\n", + skb_tailroom(skb), hdr_len, skb->len); + return -1; + } + + michael_mic_hdr(skb, tkey->tx_hdr); + pos = skb_put(skb, 8); + if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr, + skb->data + hdr_len, skb->len - 8 - hdr_len, pos)) + return -1; + + return 0; +} + +static void libipw_michael_mic_failure(struct net_device *dev, + struct ieee80211_hdr *hdr, + int keyidx) +{ + union iwreq_data wrqu; + struct iw_michaelmicfailure ev; + + /* TODO: needed parameters: count, keyid, key type, TSC */ + memset(&ev, 0, sizeof(ev)); + ev.flags = keyidx & IW_MICFAILURE_KEY_ID; + if (hdr->addr1[0] & 0x01) + ev.flags |= IW_MICFAILURE_GROUP; + else + ev.flags |= IW_MICFAILURE_PAIRWISE; + ev.src_addr.sa_family = ARPHRD_ETHER; + memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN); + memset(&wrqu, 0, sizeof(wrqu)); + wrqu.data.length = sizeof(ev); + wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev); +} + +static int libipw_michael_mic_verify(struct sk_buff *skb, int keyidx, + int hdr_len, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + u8 mic[8]; + + if (!tkey->key_set) + return -1; + + michael_mic_hdr(skb, tkey->rx_hdr); + if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr, + skb->data + hdr_len, skb->len - 8 - hdr_len, mic)) + return -1; + if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) { + struct ieee80211_hdr *hdr; + hdr = (struct ieee80211_hdr *)skb->data; + printk(KERN_DEBUG "%s: Michael MIC verification failed for " + "MSDU from %pM keyidx=%d\n", + skb->dev ? skb->dev->name : "N/A", hdr->addr2, + keyidx); + if (skb->dev) + libipw_michael_mic_failure(skb->dev, hdr, keyidx); + tkey->dot11RSNAStatsTKIPLocalMICFailures++; + return -1; + } + + /* Update TSC counters for RX now that the packet verification has + * completed. */ + tkey->rx_iv32 = tkey->rx_iv32_new; + tkey->rx_iv16 = tkey->rx_iv16_new; + + skb_trim(skb, skb->len - 8); + + return 0; +} + +static int libipw_tkip_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + int keyidx; + struct crypto_shash *tfm = tkey->tx_tfm_michael; + struct arc4_ctx *tfm2 = &tkey->tx_ctx_arc4; + struct crypto_shash *tfm3 = tkey->rx_tfm_michael; + struct arc4_ctx *tfm4 = &tkey->rx_ctx_arc4; + + keyidx = tkey->key_idx; + memset(tkey, 0, sizeof(*tkey)); + tkey->key_idx = keyidx; + tkey->tx_tfm_michael = tfm; + tkey->tx_ctx_arc4 = *tfm2; + tkey->rx_tfm_michael = tfm3; + tkey->rx_ctx_arc4 = *tfm4; + if (len == TKIP_KEY_LEN) { + memcpy(tkey->key, key, TKIP_KEY_LEN); + tkey->key_set = 1; + tkey->tx_iv16 = 1; /* TSC is initialized to 1 */ + if (seq) { + tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) | + (seq[3] << 8) | seq[2]; + tkey->rx_iv16 = (seq[1] << 8) | seq[0]; + } + } else if (len == 0) + tkey->key_set = 0; + else + return -1; + + return 0; +} + +static int libipw_tkip_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_tkip_data *tkey = priv; + + if (len < TKIP_KEY_LEN) + return -1; + + if (!tkey->key_set) + return 0; + memcpy(key, tkey->key, TKIP_KEY_LEN); + + if (seq) { + /* + * Not clear if this should return the value as is + * or - as the code previously seemed to partially + * have been written as - subtract one from it. It + * was working this way for a long time so leave it. + */ + seq[0] = tkey->tx_iv16; + seq[1] = tkey->tx_iv16 >> 8; + seq[2] = tkey->tx_iv32; + seq[3] = tkey->tx_iv32 >> 8; + seq[4] = tkey->tx_iv32 >> 16; + seq[5] = tkey->tx_iv32 >> 24; + } + + return TKIP_KEY_LEN; +} + +static void libipw_tkip_print_stats(struct seq_file *m, void *priv) +{ + struct libipw_tkip_data *tkip = priv; + seq_printf(m, + "key[%d] alg=TKIP key_set=%d " + "tx_pn=%02x%02x%02x%02x%02x%02x " + "rx_pn=%02x%02x%02x%02x%02x%02x " + "replays=%d icv_errors=%d local_mic_failures=%d\n", + tkip->key_idx, tkip->key_set, + (tkip->tx_iv32 >> 24) & 0xff, + (tkip->tx_iv32 >> 16) & 0xff, + (tkip->tx_iv32 >> 8) & 0xff, + tkip->tx_iv32 & 0xff, + (tkip->tx_iv16 >> 8) & 0xff, + tkip->tx_iv16 & 0xff, + (tkip->rx_iv32 >> 24) & 0xff, + (tkip->rx_iv32 >> 16) & 0xff, + (tkip->rx_iv32 >> 8) & 0xff, + tkip->rx_iv32 & 0xff, + (tkip->rx_iv16 >> 8) & 0xff, + tkip->rx_iv16 & 0xff, + tkip->dot11RSNAStatsTKIPReplays, + tkip->dot11RSNAStatsTKIPICVErrors, + tkip->dot11RSNAStatsTKIPLocalMICFailures); +} + +static const struct libipw_crypto_ops libipw_crypt_tkip = { + .name = "TKIP", + .init = libipw_tkip_init, + .deinit = libipw_tkip_deinit, + .encrypt_mpdu = libipw_tkip_encrypt, + .decrypt_mpdu = libipw_tkip_decrypt, + .encrypt_msdu = libipw_michael_mic_add, + .decrypt_msdu = libipw_michael_mic_verify, + .set_key = libipw_tkip_set_key, + .get_key = libipw_tkip_get_key, + .print_stats = libipw_tkip_print_stats, + .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .extra_msdu_postfix_len = 8, /* MIC */ + .get_flags = libipw_tkip_get_flags, + .set_flags = libipw_tkip_set_flags, + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_tkip_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_tkip); +} + +void libipw_crypto_tkip_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_tkip); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c new file mode 100644 index 000000000000..c3a4ccb9de17 --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_crypto_wep.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * libipw crypt: host-based WEP encryption implementation for libipw + * + * Copyright (c) 2002-2004, Jouni Malinen + * Copyright (c) 2008, John W. Linville + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +struct libipw_wep_data { + u32 iv; +#define WEP_KEY_LEN 13 + u8 key[WEP_KEY_LEN + 1]; + u8 key_len; + u8 key_idx; + struct arc4_ctx tx_ctx; + struct arc4_ctx rx_ctx; +}; + +static void *libipw_wep_init(int keyidx) +{ + struct libipw_wep_data *priv; + + if (fips_enabled) + return NULL; + + priv = kzalloc(sizeof(*priv), GFP_ATOMIC); + if (priv == NULL) + return NULL; + priv->key_idx = keyidx; + + /* start WEP IV from a random value */ + get_random_bytes(&priv->iv, 4); + + return priv; +} + +static void libipw_wep_deinit(void *priv) +{ + kfree_sensitive(priv); +} + +/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */ +static int libipw_wep_build_iv(struct sk_buff *skb, int hdr_len, + u8 *key, int keylen, void *priv) +{ + struct libipw_wep_data *wep = priv; + u32 klen; + u8 *pos; + + if (skb_headroom(skb) < 4 || skb->len < hdr_len) + return -1; + + pos = skb_push(skb, 4); + memmove(pos, pos + 4, hdr_len); + pos += hdr_len; + + klen = 3 + wep->key_len; + + wep->iv++; + + /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key + * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N) + * can be used to speedup attacks, so avoid using them. */ + if ((wep->iv & 0xff00) == 0xff00) { + u8 B = (wep->iv >> 16) & 0xff; + if (B >= 3 && B < klen) + wep->iv += 0x0100; + } + + /* Prepend 24-bit IV to RC4 key and TX frame */ + *pos++ = (wep->iv >> 16) & 0xff; + *pos++ = (wep->iv >> 8) & 0xff; + *pos++ = wep->iv & 0xff; + *pos++ = wep->key_idx << 6; + + return 0; +} + +/* Perform WEP encryption on given skb that has at least 4 bytes of headroom + * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, + * so the payload length increases with 8 bytes. + * + * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) + */ +static int libipw_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_wep_data *wep = priv; + u32 crc, klen, len; + u8 *pos, *icv; + u8 key[WEP_KEY_LEN + 3]; + + /* other checks are in libipw_wep_build_iv */ + if (skb_tailroom(skb) < 4) + return -1; + + /* add the IV to the frame */ + if (libipw_wep_build_iv(skb, hdr_len, NULL, 0, priv)) + return -1; + + /* Copy the IV into the first 3 bytes of the key */ + skb_copy_from_linear_data_offset(skb, hdr_len, key, 3); + + /* Copy rest of the WEP key (the secret part) */ + memcpy(key + 3, wep->key, wep->key_len); + + len = skb->len - hdr_len - 4; + pos = skb->data + hdr_len + 4; + klen = 3 + wep->key_len; + + /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */ + crc = ~crc32_le(~0, pos, len); + icv = skb_put(skb, 4); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + + arc4_setkey(&wep->tx_ctx, key, klen); + arc4_crypt(&wep->tx_ctx, pos, pos, len + 4); + + return 0; +} + +/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of + * the frame: IV (4 bytes), encrypted payload (including SNAP header), + * ICV (4 bytes). len includes both IV and ICV. + * + * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on + * failure. If frame is OK, IV and ICV will be removed. + */ +static int libipw_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) +{ + struct libipw_wep_data *wep = priv; + u32 crc, klen, plen; + u8 key[WEP_KEY_LEN + 3]; + u8 keyidx, *pos, icv[4]; + + if (skb->len < hdr_len + 8) + return -1; + + pos = skb->data + hdr_len; + key[0] = *pos++; + key[1] = *pos++; + key[2] = *pos++; + keyidx = *pos++ >> 6; + if (keyidx != wep->key_idx) + return -1; + + klen = 3 + wep->key_len; + + /* Copy rest of the WEP key (the secret part) */ + memcpy(key + 3, wep->key, wep->key_len); + + /* Apply RC4 to data and compute CRC32 over decrypted data */ + plen = skb->len - hdr_len - 8; + + arc4_setkey(&wep->rx_ctx, key, klen); + arc4_crypt(&wep->rx_ctx, pos, pos, plen + 4); + + crc = ~crc32_le(~0, pos, plen); + icv[0] = crc; + icv[1] = crc >> 8; + icv[2] = crc >> 16; + icv[3] = crc >> 24; + if (memcmp(icv, pos + plen, 4) != 0) { + /* ICV mismatch - drop frame */ + return -2; + } + + /* Remove IV and ICV */ + memmove(skb->data + 4, skb->data, hdr_len); + skb_pull(skb, 4); + skb_trim(skb, skb->len - 4); + + return 0; +} + +static int libipw_wep_set_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_wep_data *wep = priv; + + if (len < 0 || len > WEP_KEY_LEN) + return -1; + + memcpy(wep->key, key, len); + wep->key_len = len; + + return 0; +} + +static int libipw_wep_get_key(void *key, int len, u8 * seq, void *priv) +{ + struct libipw_wep_data *wep = priv; + + if (len < wep->key_len) + return -1; + + memcpy(key, wep->key, wep->key_len); + + return wep->key_len; +} + +static void libipw_wep_print_stats(struct seq_file *m, void *priv) +{ + struct libipw_wep_data *wep = priv; + seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); +} + +static const struct libipw_crypto_ops libipw_crypt_wep = { + .name = "WEP", + .init = libipw_wep_init, + .deinit = libipw_wep_deinit, + .encrypt_mpdu = libipw_wep_encrypt, + .decrypt_mpdu = libipw_wep_decrypt, + .encrypt_msdu = NULL, + .decrypt_msdu = NULL, + .set_key = libipw_wep_set_key, + .get_key = libipw_wep_get_key, + .print_stats = libipw_wep_print_stats, + .extra_mpdu_prefix_len = 4, /* IV */ + .extra_mpdu_postfix_len = 4, /* ICV */ + .owner = THIS_MODULE, +}; + +int __init libipw_crypto_wep_init(void) +{ + return libipw_register_crypto_ops(&libipw_crypt_wep); +} + +void __exit libipw_crypto_wep_exit(void) +{ + libipw_unregister_crypto_ops(&libipw_crypt_wep); +} diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_module.c b/drivers/net/wireless/intel/ipw2x00/libipw_module.c index 43bab92a4148..0a16127bfd68 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_module.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_module.c @@ -169,7 +169,7 @@ struct net_device *alloc_libipw(int sizeof_priv, int monitor) spin_lock_init(&ieee->lock); - lib80211_crypt_info_init(&ieee->crypt_info, dev->name, &ieee->lock); + libipw_crypt_info_init(&ieee->crypt_info, dev->name, &ieee->lock); ieee->wpa_enabled = 0; ieee->drop_unencrypted = 0; @@ -191,7 +191,7 @@ void free_libipw(struct net_device *dev, int monitor) { struct libipw_device *ieee = netdev_priv(dev); - lib80211_crypt_info_free(&ieee->crypt_info); + libipw_crypt_info_free(&ieee->crypt_info); libipw_networks_free(ieee); @@ -251,6 +251,7 @@ static const struct proc_ops debug_level_proc_ops = { static int __init libipw_init(void) { + int err; #ifdef CONFIG_LIBIPW_DEBUG struct proc_dir_entry *e; @@ -273,7 +274,33 @@ static int __init libipw_init(void) printk(KERN_INFO DRV_NAME ": " DRV_DESCRIPTION ", " DRV_VERSION "\n"); printk(KERN_INFO DRV_NAME ": " DRV_COPYRIGHT "\n"); + err = libipw_crypto_init(); + if (err) + goto remove_debugfs; + err = libipw_crypto_ccmp_init(); + if (err) + goto uninit_crypto; + err = libipw_crypto_tkip_init(); + if (err) + goto uninit_crypto_ccmp; + err = libipw_crypto_wep_init(); + if (err) + goto uninit_crypto_tkip; + return 0; +uninit_crypto_tkip: + libipw_crypto_tkip_exit(); +uninit_crypto_ccmp: + libipw_crypto_ccmp_exit(); +uninit_crypto: + libipw_crypto_exit(); +remove_debugfs: +#ifdef CONFIG_LIBIPW_DEBUG + remove_proc_entry("debug_level", libipw_proc); + remove_proc_entry(DRV_PROCNAME, init_net.proc_net); + libipw_proc = NULL; +#endif + return err; } static void __exit libipw_exit(void) @@ -285,6 +312,11 @@ static void __exit libipw_exit(void) libipw_proc = NULL; } #endif /* CONFIG_LIBIPW_DEBUG */ + + libipw_crypto_ccmp_exit(); + libipw_crypto_tkip_exit(); + libipw_crypto_wep_exit(); + libipw_crypto_exit(); } #ifdef CONFIG_LIBIPW_DEBUG diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c index 48d6870bbf4e..1fe05e73a17c 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c @@ -27,9 +27,6 @@ #include #include #include - -#include - #include "libipw.h" static void libipw_monitor_rx(struct libipw_device *ieee, @@ -266,7 +263,7 @@ static int libipw_is_eapol_frame(struct libipw_device *ieee, /* Called only as a tasklet (software IRQ), by libipw_rx */ static int libipw_rx_frame_decrypt(struct libipw_device *ieee, struct sk_buff *skb, - struct lib80211_crypt_data *crypt) + struct libipw_crypt_data *crypt) { struct libipw_hdr_3addr *hdr; int res, hdrlen; @@ -298,7 +295,7 @@ libipw_rx_frame_decrypt(struct libipw_device *ieee, struct sk_buff *skb, static int libipw_rx_frame_decrypt_msdu(struct libipw_device *ieee, struct sk_buff *skb, int keyidx, - struct lib80211_crypt_data *crypt) + struct libipw_crypt_data *crypt) { struct libipw_hdr_3addr *hdr; int res, hdrlen; @@ -345,7 +342,7 @@ int libipw_rx(struct libipw_device *ieee, struct sk_buff *skb, #endif u8 dst[ETH_ALEN]; u8 src[ETH_ALEN]; - struct lib80211_crypt_data *crypt = NULL; + struct libipw_crypt_data *crypt = NULL; int keyidx = 0; int can_be_decrypted = 0; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c index e22a6732a4c3..80edaa3dea9c 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_tx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_tx.c @@ -138,7 +138,7 @@ static int libipw_copy_snap(u8 * data, __be16 h_proto) static int libipw_encrypt_fragment(struct libipw_device *ieee, struct sk_buff *frag, int hdr_len) { - struct lib80211_crypt_data *crypt = + struct libipw_crypt_data *crypt = ieee->crypt_info.crypt[ieee->crypt_info.tx_keyidx]; int res; @@ -255,7 +255,7 @@ netdev_tx_t libipw_xmit(struct sk_buff *skb, struct net_device *dev) .qos_ctl = 0 }; u8 dest[ETH_ALEN], src[ETH_ALEN]; - struct lib80211_crypt_data *crypt; + struct libipw_crypt_data *crypt; int priority = skb->priority; int snapped = 0; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_wx.c b/drivers/net/wireless/intel/ipw2x00/libipw_wx.c index dbc7153d0a3d..db71d81b0d4f 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_wx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_wx.c @@ -21,10 +21,7 @@ #include #include #include - -#include #include - #include "libipw.h" static const char *libipw_modes[] = { @@ -303,7 +300,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, .flags = 0 }; int i, key, key_provided, len; - struct lib80211_crypt_data **crypt; + struct libipw_crypt_data **crypt; int host_crypto = ieee->host_encrypt || ieee->host_decrypt; LIBIPW_DEBUG_WX("SET_ENCODE\n"); @@ -328,7 +325,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, if (key_provided && *crypt) { LIBIPW_DEBUG_WX("Disabling encryption on key %d.\n", key); - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); } else LIBIPW_DEBUG_WX("Disabling encryption.\n"); @@ -338,7 +335,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, if (ieee->crypt_info.crypt[i] != NULL) { if (key_provided) break; - lib80211_crypt_delayed_deinit(&ieee->crypt_info, + libipw_crypt_delayed_deinit(&ieee->crypt_info, &ieee->crypt_info.crypt[i]); } } @@ -361,21 +358,21 @@ int libipw_wx_set_encode(struct libipw_device *ieee, strcmp((*crypt)->ops->name, "WEP") != 0) { /* changing to use WEP; deinit previously used algorithm * on this key */ - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); } if (*crypt == NULL && host_crypto) { - struct lib80211_crypt_data *new_crypt; + struct libipw_crypt_data *new_crypt; /* take WEP into use */ - new_crypt = kzalloc(sizeof(struct lib80211_crypt_data), + new_crypt = kzalloc(sizeof(struct libipw_crypt_data), GFP_KERNEL); if (new_crypt == NULL) return -ENOMEM; - new_crypt->ops = lib80211_get_crypto_ops("WEP"); + new_crypt->ops = libipw_get_crypto_ops("WEP"); if (!new_crypt->ops) { - request_module("lib80211_crypt_wep"); - new_crypt->ops = lib80211_get_crypto_ops("WEP"); + request_module("libipw_crypt_wep"); + new_crypt->ops = libipw_get_crypto_ops("WEP"); } if (new_crypt->ops && try_module_get(new_crypt->ops->owner)) @@ -386,7 +383,7 @@ int libipw_wx_set_encode(struct libipw_device *ieee, new_crypt = NULL; printk(KERN_WARNING "%s: could not initialize WEP: " - "load module lib80211_crypt_wep\n", dev->name); + "load module libipw_crypt_wep\n", dev->name); return -EOPNOTSUPP; } *crypt = new_crypt; @@ -509,8 +506,8 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, int i, idx, ret = 0; int group_key = 0; const char *alg, *module; - const struct lib80211_crypto_ops *ops; - struct lib80211_crypt_data **crypt; + const struct libipw_crypto_ops *ops; + struct libipw_crypt_data **crypt; struct libipw_security sec = { .flags = 0, @@ -541,7 +538,7 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, if ((encoding->flags & IW_ENCODE_DISABLED) || ext->alg == IW_ENCODE_ALG_NONE) { if (*crypt) - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); for (i = 0; i < WEP_KEYS; i++) if (ieee->crypt_info.crypt[i] != NULL) @@ -567,15 +564,15 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, switch (ext->alg) { case IW_ENCODE_ALG_WEP: alg = "WEP"; - module = "lib80211_crypt_wep"; + module = "libipw_crypt_wep"; break; case IW_ENCODE_ALG_TKIP: alg = "TKIP"; - module = "lib80211_crypt_tkip"; + module = "libipw_crypt_tkip"; break; case IW_ENCODE_ALG_CCMP: alg = "CCMP"; - module = "lib80211_crypt_ccmp"; + module = "libipw_crypt_ccmp"; break; default: LIBIPW_DEBUG_WX("%s: unknown crypto alg %d\n", @@ -584,10 +581,10 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, goto done; } - ops = lib80211_get_crypto_ops(alg); + ops = libipw_get_crypto_ops(alg); if (ops == NULL) { request_module(module); - ops = lib80211_get_crypto_ops(alg); + ops = libipw_get_crypto_ops(alg); } if (ops == NULL) { LIBIPW_DEBUG_WX("%s: unknown crypto alg %d\n", @@ -597,9 +594,9 @@ int libipw_wx_set_encodeext(struct libipw_device *ieee, } if (*crypt == NULL || (*crypt)->ops != ops) { - struct lib80211_crypt_data *new_crypt; + struct libipw_crypt_data *new_crypt; - lib80211_crypt_delayed_deinit(&ieee->crypt_info, crypt); + libipw_crypt_delayed_deinit(&ieee->crypt_info, crypt); new_crypt = kzalloc(sizeof(*new_crypt), GFP_KERNEL); if (new_crypt == NULL) { diff --git a/include/net/lib80211.h b/include/net/lib80211.h deleted file mode 100644 index fd0f15d87d80..000000000000 --- a/include/net/lib80211.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * lib80211.h -- common bits for IEEE802.11 wireless drivers - * - * Copyright (c) 2008, John W. Linville - * - * Some bits copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Original code based on Host AP (software wireless LAN access point) driver - * for Intersil Prism2/2.5/3. - * - * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen - * - * Copyright (c) 2002-2003, Jouni Malinen - * - * Adaption to a generic IEEE 802.11 stack by James Ketrenos - * - * - * Copyright (c) 2004, Intel Corporation - * - */ - -#ifndef LIB80211_H -#define LIB80211_H - -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_WEP_KEYS 4 - -enum { - IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), -}; - -struct module; - -struct lib80211_crypto_ops { - const char *name; - struct list_head list; - - /* init new crypto context (e.g., allocate private data space, - * select IV, etc.); returns NULL on failure or pointer to allocated - * private data on success */ - void *(*init) (int keyidx); - - /* deinitialize crypto context and free allocated private data */ - void (*deinit) (void *priv); - - /* encrypt/decrypt return < 0 on error or >= 0 on success. The return - * value from decrypt_mpdu is passed as the keyidx value for - * decrypt_msdu. skb must have enough head and tail room for the - * encryption; if not, error will be returned; these functions are - * called for all MPDUs (i.e., fragments). - */ - int (*encrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_mpdu) (struct sk_buff * skb, int hdr_len, void *priv); - - /* These functions are called for full MSDUs, i.e. full frames. - * These can be NULL if full MSDU operations are not needed. */ - int (*encrypt_msdu) (struct sk_buff * skb, int hdr_len, void *priv); - int (*decrypt_msdu) (struct sk_buff * skb, int keyidx, int hdr_len, - void *priv); - - int (*set_key) (void *key, int len, u8 * seq, void *priv); - int (*get_key) (void *key, int len, u8 * seq, void *priv); - - /* procfs handler for printing out key information and possible - * statistics */ - void (*print_stats) (struct seq_file *m, void *priv); - - /* Crypto specific flag get/set for configuration settings */ - unsigned long (*get_flags) (void *priv); - unsigned long (*set_flags) (unsigned long flags, void *priv); - - /* maximum number of bytes added by encryption; encrypt buf is - * allocated with extra_prefix_len bytes, copy of in_buf, and - * extra_postfix_len; encrypt need not use all this space, but - * the result must start at the beginning of the buffer and correct - * length must be returned */ - int extra_mpdu_prefix_len, extra_mpdu_postfix_len; - int extra_msdu_prefix_len, extra_msdu_postfix_len; - - struct module *owner; -}; - -struct lib80211_crypt_data { - struct list_head list; /* delayed deletion list */ - const struct lib80211_crypto_ops *ops; - void *priv; - atomic_t refcnt; -}; - -struct lib80211_crypt_info { - char *name; - /* Most clients will already have a lock, - so just point to that. */ - spinlock_t *lock; - - struct lib80211_crypt_data *crypt[NUM_WEP_KEYS]; - int tx_keyidx; /* default TX key index (crypt[tx_keyidx]) */ - struct list_head crypt_deinit_list; - struct timer_list crypt_deinit_timer; - int crypt_quiesced; -}; - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock); -void lib80211_crypt_info_free(struct lib80211_crypt_info *info); -int lib80211_register_crypto_ops(const struct lib80211_crypto_ops *ops); -int lib80211_unregister_crypto_ops(const struct lib80211_crypto_ops *ops); -const struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name); -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt); - -#endif /* LIB80211_H */ diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 10345388ad13..733c53ad4de5 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -212,36 +212,3 @@ config CFG80211_KUNIT_TEST If unsure, say N. endif # CFG80211 - -config LIB80211 - tristate - default n - help - This options enables a library of common routines used - by IEEE802.11 wireless LAN drivers. - - Drivers should select this themselves if needed. - -config LIB80211_CRYPT_WEP - tristate - select CRYPTO_LIB_ARC4 - -config LIB80211_CRYPT_CCMP - tristate - select CRYPTO - select CRYPTO_AES - select CRYPTO_CCM - -config LIB80211_CRYPT_TKIP - tristate - select CRYPTO_LIB_ARC4 - -config LIB80211_DEBUG - bool "lib80211 debugging messages" - depends on LIB80211 - default n - help - You can enable this if you want verbose debugging messages - from lib80211. - - If unsure, say N. diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 1d49cc8b6da1..27f211bd9954 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CFG80211) += cfg80211.o -obj-$(CONFIG_LIB80211) += lib80211.o -obj-$(CONFIG_LIB80211_CRYPT_WEP) += lib80211_crypt_wep.o -obj-$(CONFIG_LIB80211_CRYPT_CCMP) += lib80211_crypt_ccmp.o -obj-$(CONFIG_LIB80211_CRYPT_TKIP) += lib80211_crypt_tkip.o obj-y += tests/ obj-$(CONFIG_WEXT_CORE) += wext-core.o diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c deleted file mode 100644 index 64c447040786..000000000000 --- a/net/wireless/lib80211.c +++ /dev/null @@ -1,257 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 -- common bits for IEEE802.11 drivers - * - * Copyright(c) 2008 John W. Linville - * - * Portions copied from old ieee80211 component, w/ original copyright - * notices below: - * - * Host AP crypto routines - * - * Copyright (c) 2002-2003, Jouni Malinen - * Portions Copyright (C) 2004, Intel Corporation - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define DRV_DESCRIPTION "common routines for IEEE802.11 drivers" - -MODULE_DESCRIPTION(DRV_DESCRIPTION); -MODULE_AUTHOR("John W. Linville "); -MODULE_LICENSE("GPL"); - -struct lib80211_crypto_alg { - struct list_head list; - const struct lib80211_crypto_ops *ops; -}; - -static LIST_HEAD(lib80211_crypto_algs); -static DEFINE_SPINLOCK(lib80211_crypto_lock); - -static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, - int force); -static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info); -static void lib80211_crypt_deinit_handler(struct timer_list *t); - -int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name, - spinlock_t *lock) -{ - memset(info, 0, sizeof(*info)); - - info->name = name; - info->lock = lock; - - INIT_LIST_HEAD(&info->crypt_deinit_list); - timer_setup(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler, - 0); - - return 0; -} -EXPORT_SYMBOL(lib80211_crypt_info_init); - -void lib80211_crypt_info_free(struct lib80211_crypt_info *info) -{ - int i; - - lib80211_crypt_quiescing(info); - del_timer_sync(&info->crypt_deinit_timer); - lib80211_crypt_deinit_entries(info, 1); - - for (i = 0; i < NUM_WEP_KEYS; i++) { - struct lib80211_crypt_data *crypt = info->crypt[i]; - if (crypt) { - if (crypt->ops) { - crypt->ops->deinit(crypt->priv); - module_put(crypt->ops->owner); - } - kfree(crypt); - info->crypt[i] = NULL; - } - } -} -EXPORT_SYMBOL(lib80211_crypt_info_free); - -static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info, - int force) -{ - struct lib80211_crypt_data *entry, *next; - unsigned long flags; - - spin_lock_irqsave(info->lock, flags); - list_for_each_entry_safe(entry, next, &info->crypt_deinit_list, list) { - if (atomic_read(&entry->refcnt) != 0 && !force) - continue; - - list_del(&entry->list); - - if (entry->ops) { - entry->ops->deinit(entry->priv); - module_put(entry->ops->owner); - } - kfree(entry); - } - spin_unlock_irqrestore(info->lock, flags); -} - -/* After this, crypt_deinit_list won't accept new members */ -static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info) -{ - unsigned long flags; - - spin_lock_irqsave(info->lock, flags); - info->crypt_quiesced = 1; - spin_unlock_irqrestore(info->lock, flags); -} - -static void lib80211_crypt_deinit_handler(struct timer_list *t) -{ - struct lib80211_crypt_info *info = from_timer(info, t, - crypt_deinit_timer); - unsigned long flags; - - lib80211_crypt_deinit_entries(info, 0); - - spin_lock_irqsave(info->lock, flags); - if (!list_empty(&info->crypt_deinit_list) && !info->crypt_quiesced) { - printk(KERN_DEBUG "%s: entries remaining in delayed crypt " - "deletion list\n", info->name); - info->crypt_deinit_timer.expires = jiffies + HZ; - add_timer(&info->crypt_deinit_timer); - } - spin_unlock_irqrestore(info->lock, flags); -} - -void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info, - struct lib80211_crypt_data **crypt) -{ - struct lib80211_crypt_data *tmp; - unsigned long flags; - - if (*crypt == NULL) - return; - - tmp = *crypt; - *crypt = NULL; - - /* must not run ops->deinit() while there may be pending encrypt or - * decrypt operations. Use a list of delayed deinits to avoid needing - * locking. */ - - spin_lock_irqsave(info->lock, flags); - if (!info->crypt_quiesced) { - list_add(&tmp->list, &info->crypt_deinit_list); - if (!timer_pending(&info->crypt_deinit_timer)) { - info->crypt_deinit_timer.expires = jiffies + HZ; - add_timer(&info->crypt_deinit_timer); - } - } - spin_unlock_irqrestore(info->lock, flags); -} -EXPORT_SYMBOL(lib80211_crypt_delayed_deinit); - -int lib80211_register_crypto_ops(const struct lib80211_crypto_ops *ops) -{ - unsigned long flags; - struct lib80211_crypto_alg *alg; - - alg = kzalloc(sizeof(*alg), GFP_KERNEL); - if (alg == NULL) - return -ENOMEM; - - alg->ops = ops; - - spin_lock_irqsave(&lib80211_crypto_lock, flags); - list_add(&alg->list, &lib80211_crypto_algs); - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - - printk(KERN_DEBUG "lib80211_crypt: registered algorithm '%s'\n", - ops->name); - - return 0; -} -EXPORT_SYMBOL(lib80211_register_crypto_ops); - -int lib80211_unregister_crypto_ops(const struct lib80211_crypto_ops *ops) -{ - struct lib80211_crypto_alg *alg; - unsigned long flags; - - spin_lock_irqsave(&lib80211_crypto_lock, flags); - list_for_each_entry(alg, &lib80211_crypto_algs, list) { - if (alg->ops == ops) - goto found; - } - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - return -EINVAL; - - found: - printk(KERN_DEBUG "lib80211_crypt: unregistered algorithm '%s'\n", - ops->name); - list_del(&alg->list); - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - kfree(alg); - return 0; -} -EXPORT_SYMBOL(lib80211_unregister_crypto_ops); - -const struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name) -{ - struct lib80211_crypto_alg *alg; - unsigned long flags; - - spin_lock_irqsave(&lib80211_crypto_lock, flags); - list_for_each_entry(alg, &lib80211_crypto_algs, list) { - if (strcmp(alg->ops->name, name) == 0) - goto found; - } - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - return NULL; - - found: - spin_unlock_irqrestore(&lib80211_crypto_lock, flags); - return alg->ops; -} -EXPORT_SYMBOL(lib80211_get_crypto_ops); - -static void *lib80211_crypt_null_init(int keyidx) -{ - return (void *)1; -} - -static void lib80211_crypt_null_deinit(void *priv) -{ -} - -static const struct lib80211_crypto_ops lib80211_crypt_null = { - .name = "NULL", - .init = lib80211_crypt_null_init, - .deinit = lib80211_crypt_null_deinit, - .owner = THIS_MODULE, -}; - -static int __init lib80211_init(void) -{ - pr_info(DRV_DESCRIPTION "\n"); - return lib80211_register_crypto_ops(&lib80211_crypt_null); -} - -static void __exit lib80211_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_null); - BUG_ON(!list_empty(&lib80211_crypto_algs)); -} - -module_init(lib80211_init); -module_exit(lib80211_exit); diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c deleted file mode 100644 index 5aad139130e1..000000000000 --- a/net/wireless/lib80211_crypt_ccmp.c +++ /dev/null @@ -1,448 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 crypt: host-based CCMP encryption implementation for lib80211 - * - * Copyright (c) 2003-2004, Jouni Malinen - * Copyright (c) 2008, John W. Linville - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include - -MODULE_AUTHOR("Jouni Malinen"); -MODULE_DESCRIPTION("Host AP crypt: CCMP"); -MODULE_LICENSE("GPL"); - -#define AES_BLOCK_LEN 16 -#define CCMP_HDR_LEN 8 -#define CCMP_MIC_LEN 8 -#define CCMP_TK_LEN 16 -#define CCMP_PN_LEN 6 - -struct lib80211_ccmp_data { - u8 key[CCMP_TK_LEN]; - int key_set; - - u8 tx_pn[CCMP_PN_LEN]; - u8 rx_pn[CCMP_PN_LEN]; - - u32 dot11RSNAStatsCCMPFormatErrors; - u32 dot11RSNAStatsCCMPReplays; - u32 dot11RSNAStatsCCMPDecryptErrors; - - int key_idx; - - struct crypto_aead *tfm; - - /* scratch buffers for virt_to_page() (crypto API) */ - u8 tx_aad[2 * AES_BLOCK_LEN]; - u8 rx_aad[2 * AES_BLOCK_LEN]; -}; - -static void *lib80211_ccmp_init(int key_idx) -{ - struct lib80211_ccmp_data *priv; - - priv = kzalloc(sizeof(*priv), GFP_ATOMIC); - if (priv == NULL) - goto fail; - priv->key_idx = key_idx; - - priv->tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(priv->tfm)) { - priv->tfm = NULL; - goto fail; - } - - return priv; - - fail: - if (priv) { - if (priv->tfm) - crypto_free_aead(priv->tfm); - kfree(priv); - } - - return NULL; -} - -static void lib80211_ccmp_deinit(void *priv) -{ - struct lib80211_ccmp_data *_priv = priv; - if (_priv && _priv->tfm) - crypto_free_aead(_priv->tfm); - kfree(priv); -} - -static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, - const u8 *pn, u8 *iv, u8 *aad) -{ - u8 *pos, qc = 0; - size_t aad_len; - int a4_included, qc_included; - - a4_included = ieee80211_has_a4(hdr->frame_control); - qc_included = ieee80211_is_data_qos(hdr->frame_control); - - aad_len = 22; - if (a4_included) - aad_len += 6; - if (qc_included) { - pos = (u8 *) & hdr->addr4; - if (a4_included) - pos += 6; - qc = *pos & 0x0f; - aad_len += 2; - } - - /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC - * mode authentication are not allowed to collide, yet both are derived - * from the same vector. We only set L := 1 here to indicate that the - * data size can be represented in (L+1) bytes. The CCM layer will take - * care of storing the data length in the top (L+1) bytes and setting - * and clearing the other bits as is required to derive the two IVs. - */ - iv[0] = 0x1; - - /* Nonce: QC | A2 | PN */ - iv[1] = qc; - memcpy(iv + 2, hdr->addr2, ETH_ALEN); - memcpy(iv + 8, pn, CCMP_PN_LEN); - - /* AAD: - * FC with bits 4..6 and 11..13 masked to zero; 14 is always one - * A1 | A2 | A3 - * SC with bits 4..15 (seq#) masked to zero - * A4 (if present) - * QC (if present) - */ - pos = (u8 *) hdr; - aad[0] = pos[0] & 0x8f; - aad[1] = pos[1] & 0xc7; - memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); - pos = (u8 *) & hdr->seq_ctrl; - aad[20] = pos[0] & 0x0f; - aad[21] = 0; /* all bits masked */ - memset(aad + 22, 0, 8); - if (a4_included) - memcpy(aad + 22, hdr->addr4, ETH_ALEN); - if (qc_included) { - aad[a4_included ? 28 : 22] = qc; - /* rest of QC masked */ - } - return aad_len; -} - -static int lib80211_ccmp_hdr(struct sk_buff *skb, int hdr_len, - u8 *aeskey, int keylen, void *priv) -{ - struct lib80211_ccmp_data *key = priv; - int i; - u8 *pos; - - if (skb_headroom(skb) < CCMP_HDR_LEN || skb->len < hdr_len) - return -1; - - if (aeskey != NULL && keylen >= CCMP_TK_LEN) - memcpy(aeskey, key->key, CCMP_TK_LEN); - - pos = skb_push(skb, CCMP_HDR_LEN); - memmove(pos, pos + CCMP_HDR_LEN, hdr_len); - pos += hdr_len; - - i = CCMP_PN_LEN - 1; - while (i >= 0) { - key->tx_pn[i]++; - if (key->tx_pn[i] != 0) - break; - i--; - } - - *pos++ = key->tx_pn[5]; - *pos++ = key->tx_pn[4]; - *pos++ = 0; - *pos++ = (key->key_idx << 6) | (1 << 5) /* Ext IV included */ ; - *pos++ = key->tx_pn[3]; - *pos++ = key->tx_pn[2]; - *pos++ = key->tx_pn[1]; - *pos++ = key->tx_pn[0]; - - return CCMP_HDR_LEN; -} - -static int lib80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_ccmp_data *key = priv; - struct ieee80211_hdr *hdr; - struct aead_request *req; - struct scatterlist sg[2]; - u8 *aad = key->tx_aad; - u8 iv[AES_BLOCK_LEN]; - int len, data_len, aad_len; - int ret; - - if (skb_tailroom(skb) < CCMP_MIC_LEN || skb->len < hdr_len) - return -1; - - data_len = skb->len - hdr_len; - len = lib80211_ccmp_hdr(skb, hdr_len, NULL, 0, priv); - if (len < 0) - return -1; - - req = aead_request_alloc(key->tfm, GFP_ATOMIC); - if (!req) - return -ENOMEM; - - hdr = (struct ieee80211_hdr *)skb->data; - aad_len = ccmp_init_iv_and_aad(hdr, key->tx_pn, iv, aad); - - skb_put(skb, CCMP_MIC_LEN); - - sg_init_table(sg, 2); - sg_set_buf(&sg[0], aad, aad_len); - sg_set_buf(&sg[1], skb->data + hdr_len + CCMP_HDR_LEN, - data_len + CCMP_MIC_LEN); - - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_ad(req, aad_len); - aead_request_set_crypt(req, sg, sg, data_len, iv); - - ret = crypto_aead_encrypt(req); - aead_request_free(req); - - return ret; -} - -/* - * deal with seq counter wrapping correctly. - * refer to timer_after() for jiffies wrapping handling - */ -static inline int ccmp_replay_check(u8 *pn_n, u8 *pn_o) -{ - u32 iv32_n, iv16_n; - u32 iv32_o, iv16_o; - - iv32_n = (pn_n[0] << 24) | (pn_n[1] << 16) | (pn_n[2] << 8) | pn_n[3]; - iv16_n = (pn_n[4] << 8) | pn_n[5]; - - iv32_o = (pn_o[0] << 24) | (pn_o[1] << 16) | (pn_o[2] << 8) | pn_o[3]; - iv16_o = (pn_o[4] << 8) | pn_o[5]; - - if ((s32)iv32_n - (s32)iv32_o < 0 || - (iv32_n == iv32_o && iv16_n <= iv16_o)) - return 1; - return 0; -} - -static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_ccmp_data *key = priv; - u8 keyidx, *pos; - struct ieee80211_hdr *hdr; - struct aead_request *req; - struct scatterlist sg[2]; - u8 *aad = key->rx_aad; - u8 iv[AES_BLOCK_LEN]; - u8 pn[6]; - int aad_len, ret; - size_t data_len = skb->len - hdr_len - CCMP_HDR_LEN; - - if (skb->len < hdr_len + CCMP_HDR_LEN + CCMP_MIC_LEN) { - key->dot11RSNAStatsCCMPFormatErrors++; - return -1; - } - - hdr = (struct ieee80211_hdr *)skb->data; - pos = skb->data + hdr_len; - keyidx = pos[3]; - if (!(keyidx & (1 << 5))) { - net_dbg_ratelimited("CCMP: received packet without ExtIV flag from %pM\n", - hdr->addr2); - key->dot11RSNAStatsCCMPFormatErrors++; - return -2; - } - keyidx >>= 6; - if (key->key_idx != keyidx) { - net_dbg_ratelimited("CCMP: RX tkey->key_idx=%d frame keyidx=%d\n", - key->key_idx, keyidx); - return -6; - } - if (!key->key_set) { - net_dbg_ratelimited("CCMP: received packet from %pM with keyid=%d that does not have a configured key\n", - hdr->addr2, keyidx); - return -3; - } - - pn[0] = pos[7]; - pn[1] = pos[6]; - pn[2] = pos[5]; - pn[3] = pos[4]; - pn[4] = pos[1]; - pn[5] = pos[0]; - pos += 8; - - if (ccmp_replay_check(pn, key->rx_pn)) { -#ifdef CONFIG_LIB80211_DEBUG - net_dbg_ratelimited("CCMP: replay detected: STA=%pM previous PN %02x%02x%02x%02x%02x%02x received PN %02x%02x%02x%02x%02x%02x\n", - hdr->addr2, - key->rx_pn[0], key->rx_pn[1], key->rx_pn[2], - key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], - pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); -#endif - key->dot11RSNAStatsCCMPReplays++; - return -4; - } - - req = aead_request_alloc(key->tfm, GFP_ATOMIC); - if (!req) - return -ENOMEM; - - aad_len = ccmp_init_iv_and_aad(hdr, pn, iv, aad); - - sg_init_table(sg, 2); - sg_set_buf(&sg[0], aad, aad_len); - sg_set_buf(&sg[1], pos, data_len); - - aead_request_set_callback(req, 0, NULL, NULL); - aead_request_set_ad(req, aad_len); - aead_request_set_crypt(req, sg, sg, data_len, iv); - - ret = crypto_aead_decrypt(req); - aead_request_free(req); - - if (ret) { - net_dbg_ratelimited("CCMP: decrypt failed: STA=%pM (%d)\n", - hdr->addr2, ret); - key->dot11RSNAStatsCCMPDecryptErrors++; - return -5; - } - - memcpy(key->rx_pn, pn, CCMP_PN_LEN); - - /* Remove hdr and MIC */ - memmove(skb->data + CCMP_HDR_LEN, skb->data, hdr_len); - skb_pull(skb, CCMP_HDR_LEN); - skb_trim(skb, skb->len - CCMP_MIC_LEN); - - return keyidx; -} - -static int lib80211_ccmp_set_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_ccmp_data *data = priv; - int keyidx; - struct crypto_aead *tfm = data->tfm; - - keyidx = data->key_idx; - memset(data, 0, sizeof(*data)); - data->key_idx = keyidx; - data->tfm = tfm; - if (len == CCMP_TK_LEN) { - memcpy(data->key, key, CCMP_TK_LEN); - data->key_set = 1; - if (seq) { - data->rx_pn[0] = seq[5]; - data->rx_pn[1] = seq[4]; - data->rx_pn[2] = seq[3]; - data->rx_pn[3] = seq[2]; - data->rx_pn[4] = seq[1]; - data->rx_pn[5] = seq[0]; - } - if (crypto_aead_setauthsize(data->tfm, CCMP_MIC_LEN) || - crypto_aead_setkey(data->tfm, data->key, CCMP_TK_LEN)) - return -1; - } else if (len == 0) - data->key_set = 0; - else - return -1; - - return 0; -} - -static int lib80211_ccmp_get_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_ccmp_data *data = priv; - - if (len < CCMP_TK_LEN) - return -1; - - if (!data->key_set) - return 0; - memcpy(key, data->key, CCMP_TK_LEN); - - if (seq) { - seq[0] = data->tx_pn[5]; - seq[1] = data->tx_pn[4]; - seq[2] = data->tx_pn[3]; - seq[3] = data->tx_pn[2]; - seq[4] = data->tx_pn[1]; - seq[5] = data->tx_pn[0]; - } - - return CCMP_TK_LEN; -} - -static void lib80211_ccmp_print_stats(struct seq_file *m, void *priv) -{ - struct lib80211_ccmp_data *ccmp = priv; - - seq_printf(m, - "key[%d] alg=CCMP key_set=%d " - "tx_pn=%02x%02x%02x%02x%02x%02x " - "rx_pn=%02x%02x%02x%02x%02x%02x " - "format_errors=%d replays=%d decrypt_errors=%d\n", - ccmp->key_idx, ccmp->key_set, - ccmp->tx_pn[0], ccmp->tx_pn[1], ccmp->tx_pn[2], - ccmp->tx_pn[3], ccmp->tx_pn[4], ccmp->tx_pn[5], - ccmp->rx_pn[0], ccmp->rx_pn[1], ccmp->rx_pn[2], - ccmp->rx_pn[3], ccmp->rx_pn[4], ccmp->rx_pn[5], - ccmp->dot11RSNAStatsCCMPFormatErrors, - ccmp->dot11RSNAStatsCCMPReplays, - ccmp->dot11RSNAStatsCCMPDecryptErrors); -} - -static const struct lib80211_crypto_ops lib80211_crypt_ccmp = { - .name = "CCMP", - .init = lib80211_ccmp_init, - .deinit = lib80211_ccmp_deinit, - .encrypt_mpdu = lib80211_ccmp_encrypt, - .decrypt_mpdu = lib80211_ccmp_decrypt, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = lib80211_ccmp_set_key, - .get_key = lib80211_ccmp_get_key, - .print_stats = lib80211_ccmp_print_stats, - .extra_mpdu_prefix_len = CCMP_HDR_LEN, - .extra_mpdu_postfix_len = CCMP_MIC_LEN, - .owner = THIS_MODULE, -}; - -static int __init lib80211_crypto_ccmp_init(void) -{ - return lib80211_register_crypto_ops(&lib80211_crypt_ccmp); -} - -static void __exit lib80211_crypto_ccmp_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_ccmp); -} - -module_init(lib80211_crypto_ccmp_init); -module_exit(lib80211_crypto_ccmp_exit); diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c deleted file mode 100644 index 63e68e5e121e..000000000000 --- a/net/wireless/lib80211_crypt_tkip.c +++ /dev/null @@ -1,738 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 crypt: host-based TKIP encryption implementation for lib80211 - * - * Copyright (c) 2003-2004, Jouni Malinen - * Copyright (c) 2008, John W. Linville - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -MODULE_AUTHOR("Jouni Malinen"); -MODULE_DESCRIPTION("lib80211 crypt: TKIP"); -MODULE_LICENSE("GPL"); - -#define TKIP_HDR_LEN 8 - -struct lib80211_tkip_data { -#define TKIP_KEY_LEN 32 - u8 key[TKIP_KEY_LEN]; - int key_set; - - u32 tx_iv32; - u16 tx_iv16; - u16 tx_ttak[5]; - int tx_phase1_done; - - u32 rx_iv32; - u16 rx_iv16; - u16 rx_ttak[5]; - int rx_phase1_done; - u32 rx_iv32_new; - u16 rx_iv16_new; - - u32 dot11RSNAStatsTKIPReplays; - u32 dot11RSNAStatsTKIPICVErrors; - u32 dot11RSNAStatsTKIPLocalMICFailures; - - int key_idx; - - struct arc4_ctx rx_ctx_arc4; - struct arc4_ctx tx_ctx_arc4; - struct crypto_shash *rx_tfm_michael; - struct crypto_shash *tx_tfm_michael; - - /* scratch buffers for virt_to_page() (crypto API) */ - u8 rx_hdr[16], tx_hdr[16]; - - unsigned long flags; -}; - -static unsigned long lib80211_tkip_set_flags(unsigned long flags, void *priv) -{ - struct lib80211_tkip_data *_priv = priv; - unsigned long old_flags = _priv->flags; - _priv->flags = flags; - return old_flags; -} - -static unsigned long lib80211_tkip_get_flags(void *priv) -{ - struct lib80211_tkip_data *_priv = priv; - return _priv->flags; -} - -static void *lib80211_tkip_init(int key_idx) -{ - struct lib80211_tkip_data *priv; - - if (fips_enabled) - return NULL; - - priv = kzalloc(sizeof(*priv), GFP_ATOMIC); - if (priv == NULL) - goto fail; - - priv->key_idx = key_idx; - - priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); - if (IS_ERR(priv->tx_tfm_michael)) { - priv->tx_tfm_michael = NULL; - goto fail; - } - - priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0); - if (IS_ERR(priv->rx_tfm_michael)) { - priv->rx_tfm_michael = NULL; - goto fail; - } - - return priv; - - fail: - if (priv) { - crypto_free_shash(priv->tx_tfm_michael); - crypto_free_shash(priv->rx_tfm_michael); - kfree(priv); - } - - return NULL; -} - -static void lib80211_tkip_deinit(void *priv) -{ - struct lib80211_tkip_data *_priv = priv; - if (_priv) { - crypto_free_shash(_priv->tx_tfm_michael); - crypto_free_shash(_priv->rx_tfm_michael); - } - kfree_sensitive(priv); -} - -static inline u16 RotR1(u16 val) -{ - return (val >> 1) | (val << 15); -} - -static inline u8 Lo8(u16 val) -{ - return val & 0xff; -} - -static inline u8 Hi8(u16 val) -{ - return val >> 8; -} - -static inline u16 Lo16(u32 val) -{ - return val & 0xffff; -} - -static inline u16 Hi16(u32 val) -{ - return val >> 16; -} - -static inline u16 Mk16(u8 hi, u8 lo) -{ - return lo | (((u16) hi) << 8); -} - -static inline u16 Mk16_le(__le16 * v) -{ - return le16_to_cpu(*v); -} - -static const u16 Sbox[256] = { - 0xC6A5, 0xF884, 0xEE99, 0xF68D, 0xFF0D, 0xD6BD, 0xDEB1, 0x9154, - 0x6050, 0x0203, 0xCEA9, 0x567D, 0xE719, 0xB562, 0x4DE6, 0xEC9A, - 0x8F45, 0x1F9D, 0x8940, 0xFA87, 0xEF15, 0xB2EB, 0x8EC9, 0xFB0B, - 0x41EC, 0xB367, 0x5FFD, 0x45EA, 0x23BF, 0x53F7, 0xE496, 0x9B5B, - 0x75C2, 0xE11C, 0x3DAE, 0x4C6A, 0x6C5A, 0x7E41, 0xF502, 0x834F, - 0x685C, 0x51F4, 0xD134, 0xF908, 0xE293, 0xAB73, 0x6253, 0x2A3F, - 0x080C, 0x9552, 0x4665, 0x9D5E, 0x3028, 0x37A1, 0x0A0F, 0x2FB5, - 0x0E09, 0x2436, 0x1B9B, 0xDF3D, 0xCD26, 0x4E69, 0x7FCD, 0xEA9F, - 0x121B, 0x1D9E, 0x5874, 0x342E, 0x362D, 0xDCB2, 0xB4EE, 0x5BFB, - 0xA4F6, 0x764D, 0xB761, 0x7DCE, 0x527B, 0xDD3E, 0x5E71, 0x1397, - 0xA6F5, 0xB968, 0x0000, 0xC12C, 0x4060, 0xE31F, 0x79C8, 0xB6ED, - 0xD4BE, 0x8D46, 0x67D9, 0x724B, 0x94DE, 0x98D4, 0xB0E8, 0x854A, - 0xBB6B, 0xC52A, 0x4FE5, 0xED16, 0x86C5, 0x9AD7, 0x6655, 0x1194, - 0x8ACF, 0xE910, 0x0406, 0xFE81, 0xA0F0, 0x7844, 0x25BA, 0x4BE3, - 0xA2F3, 0x5DFE, 0x80C0, 0x058A, 0x3FAD, 0x21BC, 0x7048, 0xF104, - 0x63DF, 0x77C1, 0xAF75, 0x4263, 0x2030, 0xE51A, 0xFD0E, 0xBF6D, - 0x814C, 0x1814, 0x2635, 0xC32F, 0xBEE1, 0x35A2, 0x88CC, 0x2E39, - 0x9357, 0x55F2, 0xFC82, 0x7A47, 0xC8AC, 0xBAE7, 0x322B, 0xE695, - 0xC0A0, 0x1998, 0x9ED1, 0xA37F, 0x4466, 0x547E, 0x3BAB, 0x0B83, - 0x8CCA, 0xC729, 0x6BD3, 0x283C, 0xA779, 0xBCE2, 0x161D, 0xAD76, - 0xDB3B, 0x6456, 0x744E, 0x141E, 0x92DB, 0x0C0A, 0x486C, 0xB8E4, - 0x9F5D, 0xBD6E, 0x43EF, 0xC4A6, 0x39A8, 0x31A4, 0xD337, 0xF28B, - 0xD532, 0x8B43, 0x6E59, 0xDAB7, 0x018C, 0xB164, 0x9CD2, 0x49E0, - 0xD8B4, 0xACFA, 0xF307, 0xCF25, 0xCAAF, 0xF48E, 0x47E9, 0x1018, - 0x6FD5, 0xF088, 0x4A6F, 0x5C72, 0x3824, 0x57F1, 0x73C7, 0x9751, - 0xCB23, 0xA17C, 0xE89C, 0x3E21, 0x96DD, 0x61DC, 0x0D86, 0x0F85, - 0xE090, 0x7C42, 0x71C4, 0xCCAA, 0x90D8, 0x0605, 0xF701, 0x1C12, - 0xC2A3, 0x6A5F, 0xAEF9, 0x69D0, 0x1791, 0x9958, 0x3A27, 0x27B9, - 0xD938, 0xEB13, 0x2BB3, 0x2233, 0xD2BB, 0xA970, 0x0789, 0x33A7, - 0x2DB6, 0x3C22, 0x1592, 0xC920, 0x8749, 0xAAFF, 0x5078, 0xA57A, - 0x038F, 0x59F8, 0x0980, 0x1A17, 0x65DA, 0xD731, 0x84C6, 0xD0B8, - 0x82C3, 0x29B0, 0x5A77, 0x1E11, 0x7BCB, 0xA8FC, 0x6DD6, 0x2C3A, -}; - -static inline u16 _S_(u16 v) -{ - u16 t = Sbox[Hi8(v)]; - return Sbox[Lo8(v)] ^ ((t << 8) | (t >> 8)); -} - -#define PHASE1_LOOP_COUNT 8 - -static void tkip_mixing_phase1(u16 * TTAK, const u8 * TK, const u8 * TA, - u32 IV32) -{ - int i, j; - - /* Initialize the 80-bit TTAK from TSC (IV32) and TA[0..5] */ - TTAK[0] = Lo16(IV32); - TTAK[1] = Hi16(IV32); - TTAK[2] = Mk16(TA[1], TA[0]); - TTAK[3] = Mk16(TA[3], TA[2]); - TTAK[4] = Mk16(TA[5], TA[4]); - - for (i = 0; i < PHASE1_LOOP_COUNT; i++) { - j = 2 * (i & 1); - TTAK[0] += _S_(TTAK[4] ^ Mk16(TK[1 + j], TK[0 + j])); - TTAK[1] += _S_(TTAK[0] ^ Mk16(TK[5 + j], TK[4 + j])); - TTAK[2] += _S_(TTAK[1] ^ Mk16(TK[9 + j], TK[8 + j])); - TTAK[3] += _S_(TTAK[2] ^ Mk16(TK[13 + j], TK[12 + j])); - TTAK[4] += _S_(TTAK[3] ^ Mk16(TK[1 + j], TK[0 + j])) + i; - } -} - -static void tkip_mixing_phase2(u8 * WEPSeed, const u8 * TK, const u16 * TTAK, - u16 IV16) -{ - /* Make temporary area overlap WEP seed so that the final copy can be - * avoided on little endian hosts. */ - u16 *PPK = (u16 *) & WEPSeed[4]; - - /* Step 1 - make copy of TTAK and bring in TSC */ - PPK[0] = TTAK[0]; - PPK[1] = TTAK[1]; - PPK[2] = TTAK[2]; - PPK[3] = TTAK[3]; - PPK[4] = TTAK[4]; - PPK[5] = TTAK[4] + IV16; - - /* Step 2 - 96-bit bijective mixing using S-box */ - PPK[0] += _S_(PPK[5] ^ Mk16_le((__le16 *) & TK[0])); - PPK[1] += _S_(PPK[0] ^ Mk16_le((__le16 *) & TK[2])); - PPK[2] += _S_(PPK[1] ^ Mk16_le((__le16 *) & TK[4])); - PPK[3] += _S_(PPK[2] ^ Mk16_le((__le16 *) & TK[6])); - PPK[4] += _S_(PPK[3] ^ Mk16_le((__le16 *) & TK[8])); - PPK[5] += _S_(PPK[4] ^ Mk16_le((__le16 *) & TK[10])); - - PPK[0] += RotR1(PPK[5] ^ Mk16_le((__le16 *) & TK[12])); - PPK[1] += RotR1(PPK[0] ^ Mk16_le((__le16 *) & TK[14])); - PPK[2] += RotR1(PPK[1]); - PPK[3] += RotR1(PPK[2]); - PPK[4] += RotR1(PPK[3]); - PPK[5] += RotR1(PPK[4]); - - /* Step 3 - bring in last of TK bits, assign 24-bit WEP IV value - * WEPSeed[0..2] is transmitted as WEP IV */ - WEPSeed[0] = Hi8(IV16); - WEPSeed[1] = (Hi8(IV16) | 0x20) & 0x7F; - WEPSeed[2] = Lo8(IV16); - WEPSeed[3] = Lo8((PPK[5] ^ Mk16_le((__le16 *) & TK[0])) >> 1); - -#ifdef __BIG_ENDIAN - { - int i; - for (i = 0; i < 6; i++) - PPK[i] = (PPK[i] << 8) | (PPK[i] >> 8); - } -#endif -} - -static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len, - u8 * rc4key, int keylen, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 *pos; - struct ieee80211_hdr *hdr; - - hdr = (struct ieee80211_hdr *)skb->data; - - if (skb_headroom(skb) < TKIP_HDR_LEN || skb->len < hdr_len) - return -1; - - if (rc4key == NULL || keylen < 16) - return -1; - - if (!tkey->tx_phase1_done) { - tkip_mixing_phase1(tkey->tx_ttak, tkey->key, hdr->addr2, - tkey->tx_iv32); - tkey->tx_phase1_done = 1; - } - tkip_mixing_phase2(rc4key, tkey->key, tkey->tx_ttak, tkey->tx_iv16); - - pos = skb_push(skb, TKIP_HDR_LEN); - memmove(pos, pos + TKIP_HDR_LEN, hdr_len); - pos += hdr_len; - - *pos++ = *rc4key; - *pos++ = *(rc4key + 1); - *pos++ = *(rc4key + 2); - *pos++ = (tkey->key_idx << 6) | (1 << 5) /* Ext IV included */ ; - *pos++ = tkey->tx_iv32 & 0xff; - *pos++ = (tkey->tx_iv32 >> 8) & 0xff; - *pos++ = (tkey->tx_iv32 >> 16) & 0xff; - *pos++ = (tkey->tx_iv32 >> 24) & 0xff; - - tkey->tx_iv16++; - if (tkey->tx_iv16 == 0) { - tkey->tx_phase1_done = 0; - tkey->tx_iv32++; - } - - return TKIP_HDR_LEN; -} - -static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - int len; - u8 rc4key[16], *pos, *icv; - u32 crc; - - if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - net_dbg_ratelimited("TKIP countermeasures: dropped TX packet to %pM\n", - hdr->addr1); - return -1; - } - - if (skb_tailroom(skb) < 4 || skb->len < hdr_len) - return -1; - - len = skb->len - hdr_len; - pos = skb->data + hdr_len; - - if ((lib80211_tkip_hdr(skb, hdr_len, rc4key, 16, priv)) < 0) - return -1; - - crc = ~crc32_le(~0, pos, len); - icv = skb_put(skb, 4); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - - arc4_setkey(&tkey->tx_ctx_arc4, rc4key, 16); - arc4_crypt(&tkey->tx_ctx_arc4, pos, pos, len + 4); - - return 0; -} - -/* - * deal with seq counter wrapping correctly. - * refer to timer_after() for jiffies wrapping handling - */ -static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n, - u32 iv32_o, u16 iv16_o) -{ - if ((s32)iv32_n - (s32)iv32_o < 0 || - (iv32_n == iv32_o && iv16_n <= iv16_o)) - return 1; - return 0; -} - -static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 rc4key[16]; - u8 keyidx, *pos; - u32 iv32; - u16 iv16; - struct ieee80211_hdr *hdr; - u8 icv[4]; - u32 crc; - int plen; - - hdr = (struct ieee80211_hdr *)skb->data; - - if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) { - net_dbg_ratelimited("TKIP countermeasures: dropped received packet from %pM\n", - hdr->addr2); - return -1; - } - - if (skb->len < hdr_len + TKIP_HDR_LEN + 4) - return -1; - - pos = skb->data + hdr_len; - keyidx = pos[3]; - if (!(keyidx & (1 << 5))) { - net_dbg_ratelimited("TKIP: received packet without ExtIV flag from %pM\n", - hdr->addr2); - return -2; - } - keyidx >>= 6; - if (tkey->key_idx != keyidx) { - net_dbg_ratelimited("TKIP: RX tkey->key_idx=%d frame keyidx=%d\n", - tkey->key_idx, keyidx); - return -6; - } - if (!tkey->key_set) { - net_dbg_ratelimited("TKIP: received packet from %pM with keyid=%d that does not have a configured key\n", - hdr->addr2, keyidx); - return -3; - } - iv16 = (pos[0] << 8) | pos[2]; - iv32 = pos[4] | (pos[5] << 8) | (pos[6] << 16) | (pos[7] << 24); - pos += TKIP_HDR_LEN; - - if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { -#ifdef CONFIG_LIB80211_DEBUG - net_dbg_ratelimited("TKIP: replay detected: STA=%pM previous TSC %08x%04x received TSC %08x%04x\n", - hdr->addr2, tkey->rx_iv32, tkey->rx_iv16, - iv32, iv16); -#endif - tkey->dot11RSNAStatsTKIPReplays++; - return -4; - } - - if (iv32 != tkey->rx_iv32 || !tkey->rx_phase1_done) { - tkip_mixing_phase1(tkey->rx_ttak, tkey->key, hdr->addr2, iv32); - tkey->rx_phase1_done = 1; - } - tkip_mixing_phase2(rc4key, tkey->key, tkey->rx_ttak, iv16); - - plen = skb->len - hdr_len - 12; - - arc4_setkey(&tkey->rx_ctx_arc4, rc4key, 16); - arc4_crypt(&tkey->rx_ctx_arc4, pos, pos, plen + 4); - - crc = ~crc32_le(~0, pos, plen); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - if (memcmp(icv, pos + plen, 4) != 0) { - if (iv32 != tkey->rx_iv32) { - /* Previously cached Phase1 result was already lost, so - * it needs to be recalculated for the next packet. */ - tkey->rx_phase1_done = 0; - } -#ifdef CONFIG_LIB80211_DEBUG - net_dbg_ratelimited("TKIP: ICV error detected: STA=%pM\n", - hdr->addr2); -#endif - tkey->dot11RSNAStatsTKIPICVErrors++; - return -5; - } - - /* Update real counters only after Michael MIC verification has - * completed */ - tkey->rx_iv32_new = iv32; - tkey->rx_iv16_new = iv16; - - /* Remove IV and ICV */ - memmove(skb->data + TKIP_HDR_LEN, skb->data, hdr_len); - skb_pull(skb, TKIP_HDR_LEN); - skb_trim(skb, skb->len - 4); - - return keyidx; -} - -static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr, - u8 *data, size_t data_len, u8 *mic) -{ - SHASH_DESC_ON_STACK(desc, tfm_michael); - int err; - - if (tfm_michael == NULL) { - pr_warn("%s(): tfm_michael == NULL\n", __func__); - return -1; - } - - desc->tfm = tfm_michael; - - if (crypto_shash_setkey(tfm_michael, key, 8)) - return -1; - - err = crypto_shash_init(desc); - if (err) - goto out; - err = crypto_shash_update(desc, hdr, 16); - if (err) - goto out; - err = crypto_shash_update(desc, data, data_len); - if (err) - goto out; - err = crypto_shash_final(desc, mic); - -out: - shash_desc_zero(desc); - return err; -} - -static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr) -{ - struct ieee80211_hdr *hdr11; - - hdr11 = (struct ieee80211_hdr *)skb->data; - - switch (le16_to_cpu(hdr11->frame_control) & - (IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS)) { - case IEEE80211_FCTL_TODS: - memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ - break; - case IEEE80211_FCTL_FROMDS: - memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr3, ETH_ALEN); /* SA */ - break; - case IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS: - memcpy(hdr, hdr11->addr3, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr4, ETH_ALEN); /* SA */ - break; - default: - memcpy(hdr, hdr11->addr1, ETH_ALEN); /* DA */ - memcpy(hdr + ETH_ALEN, hdr11->addr2, ETH_ALEN); /* SA */ - break; - } - - if (ieee80211_is_data_qos(hdr11->frame_control)) { - hdr[12] = le16_to_cpu(*((__le16 *)ieee80211_get_qos_ctl(hdr11))) - & IEEE80211_QOS_CTL_TID_MASK; - } else - hdr[12] = 0; /* priority */ - - hdr[13] = hdr[14] = hdr[15] = 0; /* reserved */ -} - -static int lib80211_michael_mic_add(struct sk_buff *skb, int hdr_len, - void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 *pos; - - if (skb_tailroom(skb) < 8 || skb->len < hdr_len) { - printk(KERN_DEBUG "Invalid packet for Michael MIC add " - "(tailroom=%d hdr_len=%d skb->len=%d)\n", - skb_tailroom(skb), hdr_len, skb->len); - return -1; - } - - michael_mic_hdr(skb, tkey->tx_hdr); - pos = skb_put(skb, 8); - if (michael_mic(tkey->tx_tfm_michael, &tkey->key[16], tkey->tx_hdr, - skb->data + hdr_len, skb->len - 8 - hdr_len, pos)) - return -1; - - return 0; -} - -static void lib80211_michael_mic_failure(struct net_device *dev, - struct ieee80211_hdr *hdr, - int keyidx) -{ - union iwreq_data wrqu; - struct iw_michaelmicfailure ev; - - /* TODO: needed parameters: count, keyid, key type, TSC */ - memset(&ev, 0, sizeof(ev)); - ev.flags = keyidx & IW_MICFAILURE_KEY_ID; - if (hdr->addr1[0] & 0x01) - ev.flags |= IW_MICFAILURE_GROUP; - else - ev.flags |= IW_MICFAILURE_PAIRWISE; - ev.src_addr.sa_family = ARPHRD_ETHER; - memcpy(ev.src_addr.sa_data, hdr->addr2, ETH_ALEN); - memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = sizeof(ev); - wireless_send_event(dev, IWEVMICHAELMICFAILURE, &wrqu, (char *)&ev); -} - -static int lib80211_michael_mic_verify(struct sk_buff *skb, int keyidx, - int hdr_len, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - u8 mic[8]; - - if (!tkey->key_set) - return -1; - - michael_mic_hdr(skb, tkey->rx_hdr); - if (michael_mic(tkey->rx_tfm_michael, &tkey->key[24], tkey->rx_hdr, - skb->data + hdr_len, skb->len - 8 - hdr_len, mic)) - return -1; - if (memcmp(mic, skb->data + skb->len - 8, 8) != 0) { - struct ieee80211_hdr *hdr; - hdr = (struct ieee80211_hdr *)skb->data; - printk(KERN_DEBUG "%s: Michael MIC verification failed for " - "MSDU from %pM keyidx=%d\n", - skb->dev ? skb->dev->name : "N/A", hdr->addr2, - keyidx); - if (skb->dev) - lib80211_michael_mic_failure(skb->dev, hdr, keyidx); - tkey->dot11RSNAStatsTKIPLocalMICFailures++; - return -1; - } - - /* Update TSC counters for RX now that the packet verification has - * completed. */ - tkey->rx_iv32 = tkey->rx_iv32_new; - tkey->rx_iv16 = tkey->rx_iv16_new; - - skb_trim(skb, skb->len - 8); - - return 0; -} - -static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - int keyidx; - struct crypto_shash *tfm = tkey->tx_tfm_michael; - struct arc4_ctx *tfm2 = &tkey->tx_ctx_arc4; - struct crypto_shash *tfm3 = tkey->rx_tfm_michael; - struct arc4_ctx *tfm4 = &tkey->rx_ctx_arc4; - - keyidx = tkey->key_idx; - memset(tkey, 0, sizeof(*tkey)); - tkey->key_idx = keyidx; - tkey->tx_tfm_michael = tfm; - tkey->tx_ctx_arc4 = *tfm2; - tkey->rx_tfm_michael = tfm3; - tkey->rx_ctx_arc4 = *tfm4; - if (len == TKIP_KEY_LEN) { - memcpy(tkey->key, key, TKIP_KEY_LEN); - tkey->key_set = 1; - tkey->tx_iv16 = 1; /* TSC is initialized to 1 */ - if (seq) { - tkey->rx_iv32 = (seq[5] << 24) | (seq[4] << 16) | - (seq[3] << 8) | seq[2]; - tkey->rx_iv16 = (seq[1] << 8) | seq[0]; - } - } else if (len == 0) - tkey->key_set = 0; - else - return -1; - - return 0; -} - -static int lib80211_tkip_get_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_tkip_data *tkey = priv; - - if (len < TKIP_KEY_LEN) - return -1; - - if (!tkey->key_set) - return 0; - memcpy(key, tkey->key, TKIP_KEY_LEN); - - if (seq) { - /* - * Not clear if this should return the value as is - * or - as the code previously seemed to partially - * have been written as - subtract one from it. It - * was working this way for a long time so leave it. - */ - seq[0] = tkey->tx_iv16; - seq[1] = tkey->tx_iv16 >> 8; - seq[2] = tkey->tx_iv32; - seq[3] = tkey->tx_iv32 >> 8; - seq[4] = tkey->tx_iv32 >> 16; - seq[5] = tkey->tx_iv32 >> 24; - } - - return TKIP_KEY_LEN; -} - -static void lib80211_tkip_print_stats(struct seq_file *m, void *priv) -{ - struct lib80211_tkip_data *tkip = priv; - seq_printf(m, - "key[%d] alg=TKIP key_set=%d " - "tx_pn=%02x%02x%02x%02x%02x%02x " - "rx_pn=%02x%02x%02x%02x%02x%02x " - "replays=%d icv_errors=%d local_mic_failures=%d\n", - tkip->key_idx, tkip->key_set, - (tkip->tx_iv32 >> 24) & 0xff, - (tkip->tx_iv32 >> 16) & 0xff, - (tkip->tx_iv32 >> 8) & 0xff, - tkip->tx_iv32 & 0xff, - (tkip->tx_iv16 >> 8) & 0xff, - tkip->tx_iv16 & 0xff, - (tkip->rx_iv32 >> 24) & 0xff, - (tkip->rx_iv32 >> 16) & 0xff, - (tkip->rx_iv32 >> 8) & 0xff, - tkip->rx_iv32 & 0xff, - (tkip->rx_iv16 >> 8) & 0xff, - tkip->rx_iv16 & 0xff, - tkip->dot11RSNAStatsTKIPReplays, - tkip->dot11RSNAStatsTKIPICVErrors, - tkip->dot11RSNAStatsTKIPLocalMICFailures); -} - -static const struct lib80211_crypto_ops lib80211_crypt_tkip = { - .name = "TKIP", - .init = lib80211_tkip_init, - .deinit = lib80211_tkip_deinit, - .encrypt_mpdu = lib80211_tkip_encrypt, - .decrypt_mpdu = lib80211_tkip_decrypt, - .encrypt_msdu = lib80211_michael_mic_add, - .decrypt_msdu = lib80211_michael_mic_verify, - .set_key = lib80211_tkip_set_key, - .get_key = lib80211_tkip_get_key, - .print_stats = lib80211_tkip_print_stats, - .extra_mpdu_prefix_len = 4 + 4, /* IV + ExtIV */ - .extra_mpdu_postfix_len = 4, /* ICV */ - .extra_msdu_postfix_len = 8, /* MIC */ - .get_flags = lib80211_tkip_get_flags, - .set_flags = lib80211_tkip_set_flags, - .owner = THIS_MODULE, -}; - -static int __init lib80211_crypto_tkip_init(void) -{ - return lib80211_register_crypto_ops(&lib80211_crypt_tkip); -} - -static void __exit lib80211_crypto_tkip_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_tkip); -} - -module_init(lib80211_crypto_tkip_init); -module_exit(lib80211_crypto_tkip_exit); diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c deleted file mode 100644 index 3b148c7bef85..000000000000 --- a/net/wireless/lib80211_crypt_wep.c +++ /dev/null @@ -1,256 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * lib80211 crypt: host-based WEP encryption implementation for lib80211 - * - * Copyright (c) 2002-2004, Jouni Malinen - * Copyright (c) 2008, John W. Linville - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -MODULE_AUTHOR("Jouni Malinen"); -MODULE_DESCRIPTION("lib80211 crypt: WEP"); -MODULE_LICENSE("GPL"); - -struct lib80211_wep_data { - u32 iv; -#define WEP_KEY_LEN 13 - u8 key[WEP_KEY_LEN + 1]; - u8 key_len; - u8 key_idx; - struct arc4_ctx tx_ctx; - struct arc4_ctx rx_ctx; -}; - -static void *lib80211_wep_init(int keyidx) -{ - struct lib80211_wep_data *priv; - - if (fips_enabled) - return NULL; - - priv = kzalloc(sizeof(*priv), GFP_ATOMIC); - if (priv == NULL) - return NULL; - priv->key_idx = keyidx; - - /* start WEP IV from a random value */ - get_random_bytes(&priv->iv, 4); - - return priv; -} - -static void lib80211_wep_deinit(void *priv) -{ - kfree_sensitive(priv); -} - -/* Add WEP IV/key info to a frame that has at least 4 bytes of headroom */ -static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len, - u8 *key, int keylen, void *priv) -{ - struct lib80211_wep_data *wep = priv; - u32 klen; - u8 *pos; - - if (skb_headroom(skb) < 4 || skb->len < hdr_len) - return -1; - - pos = skb_push(skb, 4); - memmove(pos, pos + 4, hdr_len); - pos += hdr_len; - - klen = 3 + wep->key_len; - - wep->iv++; - - /* Fluhrer, Mantin, and Shamir have reported weaknesses in the key - * scheduling algorithm of RC4. At least IVs (KeyByte + 3, 0xff, N) - * can be used to speedup attacks, so avoid using them. */ - if ((wep->iv & 0xff00) == 0xff00) { - u8 B = (wep->iv >> 16) & 0xff; - if (B >= 3 && B < klen) - wep->iv += 0x0100; - } - - /* Prepend 24-bit IV to RC4 key and TX frame */ - *pos++ = (wep->iv >> 16) & 0xff; - *pos++ = (wep->iv >> 8) & 0xff; - *pos++ = wep->iv & 0xff; - *pos++ = wep->key_idx << 6; - - return 0; -} - -/* Perform WEP encryption on given skb that has at least 4 bytes of headroom - * for IV and 4 bytes of tailroom for ICV. Both IV and ICV will be transmitted, - * so the payload length increases with 8 bytes. - * - * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) - */ -static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_wep_data *wep = priv; - u32 crc, klen, len; - u8 *pos, *icv; - u8 key[WEP_KEY_LEN + 3]; - - /* other checks are in lib80211_wep_build_iv */ - if (skb_tailroom(skb) < 4) - return -1; - - /* add the IV to the frame */ - if (lib80211_wep_build_iv(skb, hdr_len, NULL, 0, priv)) - return -1; - - /* Copy the IV into the first 3 bytes of the key */ - skb_copy_from_linear_data_offset(skb, hdr_len, key, 3); - - /* Copy rest of the WEP key (the secret part) */ - memcpy(key + 3, wep->key, wep->key_len); - - len = skb->len - hdr_len - 4; - pos = skb->data + hdr_len + 4; - klen = 3 + wep->key_len; - - /* Append little-endian CRC32 over only the data and encrypt it to produce ICV */ - crc = ~crc32_le(~0, pos, len); - icv = skb_put(skb, 4); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - - arc4_setkey(&wep->tx_ctx, key, klen); - arc4_crypt(&wep->tx_ctx, pos, pos, len + 4); - - return 0; -} - -/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of - * the frame: IV (4 bytes), encrypted payload (including SNAP header), - * ICV (4 bytes). len includes both IV and ICV. - * - * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on - * failure. If frame is OK, IV and ICV will be removed. - */ -static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv) -{ - struct lib80211_wep_data *wep = priv; - u32 crc, klen, plen; - u8 key[WEP_KEY_LEN + 3]; - u8 keyidx, *pos, icv[4]; - - if (skb->len < hdr_len + 8) - return -1; - - pos = skb->data + hdr_len; - key[0] = *pos++; - key[1] = *pos++; - key[2] = *pos++; - keyidx = *pos++ >> 6; - if (keyidx != wep->key_idx) - return -1; - - klen = 3 + wep->key_len; - - /* Copy rest of the WEP key (the secret part) */ - memcpy(key + 3, wep->key, wep->key_len); - - /* Apply RC4 to data and compute CRC32 over decrypted data */ - plen = skb->len - hdr_len - 8; - - arc4_setkey(&wep->rx_ctx, key, klen); - arc4_crypt(&wep->rx_ctx, pos, pos, plen + 4); - - crc = ~crc32_le(~0, pos, plen); - icv[0] = crc; - icv[1] = crc >> 8; - icv[2] = crc >> 16; - icv[3] = crc >> 24; - if (memcmp(icv, pos + plen, 4) != 0) { - /* ICV mismatch - drop frame */ - return -2; - } - - /* Remove IV and ICV */ - memmove(skb->data + 4, skb->data, hdr_len); - skb_pull(skb, 4); - skb_trim(skb, skb->len - 4); - - return 0; -} - -static int lib80211_wep_set_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_wep_data *wep = priv; - - if (len < 0 || len > WEP_KEY_LEN) - return -1; - - memcpy(wep->key, key, len); - wep->key_len = len; - - return 0; -} - -static int lib80211_wep_get_key(void *key, int len, u8 * seq, void *priv) -{ - struct lib80211_wep_data *wep = priv; - - if (len < wep->key_len) - return -1; - - memcpy(key, wep->key, wep->key_len); - - return wep->key_len; -} - -static void lib80211_wep_print_stats(struct seq_file *m, void *priv) -{ - struct lib80211_wep_data *wep = priv; - seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len); -} - -static const struct lib80211_crypto_ops lib80211_crypt_wep = { - .name = "WEP", - .init = lib80211_wep_init, - .deinit = lib80211_wep_deinit, - .encrypt_mpdu = lib80211_wep_encrypt, - .decrypt_mpdu = lib80211_wep_decrypt, - .encrypt_msdu = NULL, - .decrypt_msdu = NULL, - .set_key = lib80211_wep_set_key, - .get_key = lib80211_wep_get_key, - .print_stats = lib80211_wep_print_stats, - .extra_mpdu_prefix_len = 4, /* IV */ - .extra_mpdu_postfix_len = 4, /* ICV */ - .owner = THIS_MODULE, -}; - -static int __init lib80211_crypto_wep_init(void) -{ - return lib80211_register_crypto_ops(&lib80211_crypt_wep); -} - -static void __exit lib80211_crypto_wep_exit(void) -{ - lib80211_unregister_crypto_ops(&lib80211_crypt_wep); -} - -module_init(lib80211_crypto_wep_init); -module_exit(lib80211_crypto_wep_exit); -- cgit v1.3 From 3a1d429ebd43bcfdf3590096ca72cbf593d1598b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:02:53 +0200 Subject: wifi: wext/libipw: move spy implementation to libipw There's no driver left using this other than ipw2200, so move the data bookkeeping and code into libipw. Link: https://patch.msgid.link/20241007210254.037d864cda7d.Ib2197cb056ff05746d3521a5fba637062acb7314@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/ipw2x00/Kconfig | 3 - drivers/net/wireless/intel/ipw2x00/Makefile | 1 + drivers/net/wireless/intel/ipw2x00/ipw2200.c | 10 +- drivers/net/wireless/intel/ipw2x00/libipw.h | 13 ++ drivers/net/wireless/intel/ipw2x00/libipw_rx.c | 2 +- drivers/net/wireless/intel/ipw2x00/libipw_spy.c | 232 ++++++++++++++++++++++++ include/net/iw_handler.h | 18 -- net/wireless/Kconfig | 3 - net/wireless/Makefile | 1 - net/wireless/wext-spy.c | 232 ------------------------ 10 files changed, 252 insertions(+), 263 deletions(-) create mode 100644 drivers/net/wireless/intel/ipw2x00/libipw_spy.c delete mode 100644 net/wireless/wext-spy.c (limited to 'net') diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index d9c042772399..ce34118f1e90 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -7,7 +7,6 @@ config IPW2100 tristate "Intel PRO/Wireless 2100 Network Connection" depends on PCI && CFG80211 select WIRELESS_EXT - select WEXT_SPY select WEXT_PRIV select FW_LOADER select LIBIPW @@ -68,7 +67,6 @@ config IPW2200 depends on PCI && CFG80211 select CFG80211_WEXT_EXPORT select WIRELESS_EXT - select WEXT_SPY select WEXT_PRIV select FW_LOADER select LIBIPW @@ -156,7 +154,6 @@ config LIBIPW tristate depends on PCI && CFG80211 select WIRELESS_EXT - select WEXT_SPY select CRYPTO select CRYPTO_MICHAEL_MIC select CRC32 diff --git a/drivers/net/wireless/intel/ipw2x00/Makefile b/drivers/net/wireless/intel/ipw2x00/Makefile index 60c5faccbe15..91e6091c4ebf 100644 --- a/drivers/net/wireless/intel/ipw2x00/Makefile +++ b/drivers/net/wireless/intel/ipw2x00/Makefile @@ -13,6 +13,7 @@ libipw-objs := \ libipw_rx.o \ libipw_wx.o \ libipw_geo.o \ + libipw_spy.o \ libipw_crypto.o \ libipw_crypto_ccmp.o \ libipw_crypto_tkip.o \ diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index f4fd1fc784b7..0008b4615731 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -9856,10 +9856,10 @@ static iw_handler ipw_wx_handlers[] = { IW_HANDLER(SIOCGIWENCODE, ipw_wx_get_encode), IW_HANDLER(SIOCSIWPOWER, ipw_wx_set_power), IW_HANDLER(SIOCGIWPOWER, ipw_wx_get_power), - IW_HANDLER(SIOCSIWSPY, iw_handler_set_spy), - IW_HANDLER(SIOCGIWSPY, iw_handler_get_spy), - IW_HANDLER(SIOCSIWTHRSPY, iw_handler_set_thrspy), - IW_HANDLER(SIOCGIWTHRSPY, iw_handler_get_thrspy), + IW_HANDLER(SIOCSIWSPY, ipw_wx_set_spy), + IW_HANDLER(SIOCGIWSPY, ipw_wx_get_spy), + IW_HANDLER(SIOCSIWTHRSPY, ipw_wx_set_thrspy), + IW_HANDLER(SIOCGIWTHRSPY, ipw_wx_get_thrspy), IW_HANDLER(SIOCSIWGENIE, ipw_wx_set_genie), IW_HANDLER(SIOCGIWGENIE, ipw_wx_get_genie), IW_HANDLER(SIOCSIWMLME, ipw_wx_set_mlme), @@ -11636,7 +11636,7 @@ static int ipw_pci_probe(struct pci_dev *pdev, priv->ieee->worst_rssi = -85; net_dev->netdev_ops = &ipw_netdev_ops; - priv->wireless_data.spy_data = &priv->ieee->spy_data; + priv->ieee->spy_enabled = true; net_dev->wireless_data = &priv->wireless_data; net_dev->wireless_handlers = &ipw_wx_handler_def; net_dev->ethtool_ops = &ipw_ethtool_ops; diff --git a/drivers/net/wireless/intel/ipw2x00/libipw.h b/drivers/net/wireless/intel/ipw2x00/libipw.h index bc727c99ff3c..3c20353e5a41 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw.h +++ b/drivers/net/wireless/intel/ipw2x00/libipw.h @@ -788,6 +788,7 @@ struct libipw_device { int iw_mode; /* operating mode (IW_MODE_*) */ struct iw_spy_data spy_data; /* iwspy support */ + bool spy_enabled; spinlock_t lock; @@ -1083,4 +1084,16 @@ void libipw_crypto_tkip_exit(void); void libipw_crypto_ccmp_exit(void); void libipw_crypto_exit(void); + +int ipw_wx_set_spy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int ipw_wx_get_spy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int ipw_wx_set_thrspy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +int ipw_wx_get_thrspy(struct net_device *dev, struct iw_request_info *info, + union iwreq_data *wrqu, char *extra); +void libipw_spy_update(struct net_device *dev, unsigned char *address, + struct iw_quality *wstats); + #endif /* LIBIPW_H */ diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c index 1fe05e73a17c..7e41cb7bbfe0 100644 --- a/drivers/net/wireless/intel/ipw2x00/libipw_rx.c +++ b/drivers/net/wireless/intel/ipw2x00/libipw_rx.c @@ -393,7 +393,7 @@ int libipw_rx(struct libipw_device *ieee, struct sk_buff *skb, wstats.updated |= IW_QUAL_QUAL_INVALID; /* Update spy records */ - wireless_spy_update(ieee->dev, hdr->addr2, &wstats); + libipw_spy_update(ieee->dev, hdr->addr2, &wstats); } #endif /* IW_WIRELESS_SPY */ #endif /* CONFIG_WIRELESS_EXT */ diff --git a/drivers/net/wireless/intel/ipw2x00/libipw_spy.c b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c new file mode 100644 index 000000000000..979aeb10aeeb --- /dev/null +++ b/drivers/net/wireless/intel/ipw2x00/libipw_spy.c @@ -0,0 +1,232 @@ +/* + * This file implement the Wireless Extensions spy API. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. + * + * (As all part of the Linux kernel, this file is GPL) + */ + +#include +#include +#include +#include +#include +#include +#include +#include "libipw.h" + +static struct iw_spy_data *get_spydata(struct net_device *dev) +{ + if (dev->wireless_data && dev->wireless_data->libipw && + dev->wireless_data->libipw->spy_enabled) + return &dev->wireless_data->libipw->spy_data; + return NULL; +} + +int ipw_wx_set_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Disable spy collection while we copy the addresses. + * While we copy addresses, any call to libipw_spy_update() + * will NOP. This is OK, as anyway the addresses are changing. */ + spydata->spy_number = 0; + + /* We want to operate without locking, because libipw_spy_update() + * most likely will happen in the interrupt handler, and therefore + * have its own locking constraints and needs performance. + * The rtnl_lock() make sure we don't race with the other iw_handlers. + * This make sure libipw_spy_update() "see" that the spy list + * is temporarily disabled. */ + smp_wmb(); + + /* Are there are addresses to copy? */ + if (wrqu->data.length > 0) { + int i; + + /* Copy addresses */ + for (i = 0; i < wrqu->data.length; i++) + memcpy(spydata->spy_address[i], address[i].sa_data, + ETH_ALEN); + /* Reset stats */ + memset(spydata->spy_stat, 0, + sizeof(struct iw_quality) * IW_MAX_SPY); + } + + /* Make sure above is updated before re-enabling */ + smp_wmb(); + + /* Enable addresses */ + spydata->spy_number = wrqu->data.length; + + return 0; +} +EXPORT_SYMBOL(ipw_wx_set_spy); + +int ipw_wx_get_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + int i; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + wrqu->data.length = spydata->spy_number; + + /* Copy addresses. */ + for (i = 0; i < spydata->spy_number; i++) { + memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); + address[i].sa_family = AF_UNIX; + } + /* Copy stats to the user buffer (just after). */ + if (spydata->spy_number > 0) + memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), + spydata->spy_stat, + sizeof(struct iw_quality) * spydata->spy_number); + /* Reset updated flags. */ + for (i = 0; i < spydata->spy_number; i++) + spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED; + return 0; +} +EXPORT_SYMBOL(ipw_wx_get_spy); + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set spy threshold + */ +int ipw_wx_set_thrspy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + spydata->spy_thr_low = threshold->low; + spydata->spy_thr_high = threshold->high; + + /* Clear flag */ + memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); + + return 0; +} +EXPORT_SYMBOL(ipw_wx_set_thrspy); + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get spy threshold + */ +int ipw_wx_get_thrspy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + threshold->low = spydata->spy_thr_low; + threshold->high = spydata->spy_thr_high; + + return 0; +} +EXPORT_SYMBOL(ipw_wx_get_thrspy); + +/*------------------------------------------------------------------*/ +/* + * Prepare and send a Spy Threshold event + */ +static void iw_send_thrspy_event(struct net_device * dev, + struct iw_spy_data * spydata, + unsigned char * address, + struct iw_quality * wstats) +{ + union iwreq_data wrqu; + struct iw_thrspy threshold; + + /* Init */ + wrqu.data.length = 1; + wrqu.data.flags = 0; + /* Copy address */ + memcpy(threshold.addr.sa_data, address, ETH_ALEN); + threshold.addr.sa_family = ARPHRD_ETHER; + /* Copy stats */ + threshold.qual = *wstats; + /* Copy also thresholds */ + threshold.low = spydata->spy_thr_low; + threshold.high = spydata->spy_thr_high; + + /* Send event to user space */ + wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); +} + +/* ---------------------------------------------------------------- */ +/* + * Call for the driver to update the spy data. + * For now, the spy data is a simple array. As the size of the array is + * small, this is good enough. If we wanted to support larger number of + * spy addresses, we should use something more efficient... + */ +void libipw_spy_update(struct net_device * dev, + unsigned char * address, + struct iw_quality * wstats) +{ + struct iw_spy_data * spydata = get_spydata(dev); + int i; + int match = -1; + + /* Make sure driver is not buggy or using the old API */ + if (!spydata) + return; + + /* Update all records that match */ + for (i = 0; i < spydata->spy_number; i++) + if (ether_addr_equal(address, spydata->spy_address[i])) { + memcpy(&(spydata->spy_stat[i]), wstats, + sizeof(struct iw_quality)); + match = i; + } + + /* Generate an event if we cross the spy threshold. + * To avoid event storms, we have a simple hysteresis : we generate + * event only when we go under the low threshold or above the + * high threshold. */ + if (match >= 0) { + if (spydata->spy_thr_under[match]) { + if (wstats->level > spydata->spy_thr_high.level) { + spydata->spy_thr_under[match] = 0; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } else { + if (wstats->level < spydata->spy_thr_low.level) { + spydata->spy_thr_under[match] = 1; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } + } +} diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 7af1082ea9a0..a7b502958d27 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -418,8 +418,6 @@ struct iw_spy_data { struct libipw_device; /* The struct */ struct iw_public_data { - /* Driver enhanced spy support */ - struct iw_spy_data * spy_data; /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ struct libipw_device * libipw; }; @@ -443,22 +441,6 @@ static inline void wireless_nlevent_flush(void) {} /* We may need a function to send a stream of events to user space. * More on that later... */ -/* Standard handler for SIOCSIWSPY */ -int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWSPY */ -int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCSIWTHRSPY */ -int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Standard handler for SIOCGIWTHRSPY */ -int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, - union iwreq_data *wrqu, char *extra); -/* Driver call to update spy records */ -void wireless_spy_update(struct net_device *dev, unsigned char *address, - struct iw_quality *wstats); - /************************* INLINE FUNCTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 733c53ad4de5..8c8bd8b75708 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -11,9 +11,6 @@ config WEXT_PROC depends on PROC_FS depends on WEXT_CORE -config WEXT_SPY - bool - config WEXT_PRIV bool diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 27f211bd9954..62a83faf0e07 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -4,7 +4,6 @@ obj-y += tests/ obj-$(CONFIG_WEXT_CORE) += wext-core.o obj-$(CONFIG_WEXT_PROC) += wext-proc.o -obj-$(CONFIG_WEXT_SPY) += wext-spy.o obj-$(CONFIG_WEXT_PRIV) += wext-priv.o cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o diff --git a/net/wireless/wext-spy.c b/net/wireless/wext-spy.c deleted file mode 100644 index b379a0371653..000000000000 --- a/net/wireless/wext-spy.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * This file implement the Wireless Extensions spy API. - * - * Authors : Jean Tourrilhes - HPL - - * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. - * - * (As all part of the Linux kernel, this file is GPL) - */ - -#include -#include -#include -#include -#include -#include -#include - -static inline struct iw_spy_data *get_spydata(struct net_device *dev) -{ - /* This is the new way */ - if (dev->wireless_data) - return dev->wireless_data->spy_data; - return NULL; -} - -int iw_handler_set_spy(struct net_device * dev, - struct iw_request_info * info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct sockaddr * address = (struct sockaddr *) extra; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - /* Disable spy collection while we copy the addresses. - * While we copy addresses, any call to wireless_spy_update() - * will NOP. This is OK, as anyway the addresses are changing. */ - spydata->spy_number = 0; - - /* We want to operate without locking, because wireless_spy_update() - * most likely will happen in the interrupt handler, and therefore - * have its own locking constraints and needs performance. - * The rtnl_lock() make sure we don't race with the other iw_handlers. - * This make sure wireless_spy_update() "see" that the spy list - * is temporarily disabled. */ - smp_wmb(); - - /* Are there are addresses to copy? */ - if (wrqu->data.length > 0) { - int i; - - /* Copy addresses */ - for (i = 0; i < wrqu->data.length; i++) - memcpy(spydata->spy_address[i], address[i].sa_data, - ETH_ALEN); - /* Reset stats */ - memset(spydata->spy_stat, 0, - sizeof(struct iw_quality) * IW_MAX_SPY); - } - - /* Make sure above is updated before re-enabling */ - smp_wmb(); - - /* Enable addresses */ - spydata->spy_number = wrqu->data.length; - - return 0; -} -EXPORT_SYMBOL(iw_handler_set_spy); - -int iw_handler_get_spy(struct net_device * dev, - struct iw_request_info * info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct sockaddr * address = (struct sockaddr *) extra; - int i; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - wrqu->data.length = spydata->spy_number; - - /* Copy addresses. */ - for (i = 0; i < spydata->spy_number; i++) { - memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); - address[i].sa_family = AF_UNIX; - } - /* Copy stats to the user buffer (just after). */ - if (spydata->spy_number > 0) - memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), - spydata->spy_stat, - sizeof(struct iw_quality) * spydata->spy_number); - /* Reset updated flags. */ - for (i = 0; i < spydata->spy_number; i++) - spydata->spy_stat[i].updated &= ~IW_QUAL_ALL_UPDATED; - return 0; -} -EXPORT_SYMBOL(iw_handler_get_spy); - -/*------------------------------------------------------------------*/ -/* - * Standard Wireless Handler : set spy threshold - */ -int iw_handler_set_thrspy(struct net_device * dev, - struct iw_request_info *info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct iw_thrspy * threshold = (struct iw_thrspy *) extra; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - /* Just do it */ - spydata->spy_thr_low = threshold->low; - spydata->spy_thr_high = threshold->high; - - /* Clear flag */ - memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); - - return 0; -} -EXPORT_SYMBOL(iw_handler_set_thrspy); - -/*------------------------------------------------------------------*/ -/* - * Standard Wireless Handler : get spy threshold - */ -int iw_handler_get_thrspy(struct net_device * dev, - struct iw_request_info *info, - union iwreq_data * wrqu, - char * extra) -{ - struct iw_spy_data * spydata = get_spydata(dev); - struct iw_thrspy * threshold = (struct iw_thrspy *) extra; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return -EOPNOTSUPP; - - /* Just do it */ - threshold->low = spydata->spy_thr_low; - threshold->high = spydata->spy_thr_high; - - return 0; -} -EXPORT_SYMBOL(iw_handler_get_thrspy); - -/*------------------------------------------------------------------*/ -/* - * Prepare and send a Spy Threshold event - */ -static void iw_send_thrspy_event(struct net_device * dev, - struct iw_spy_data * spydata, - unsigned char * address, - struct iw_quality * wstats) -{ - union iwreq_data wrqu; - struct iw_thrspy threshold; - - /* Init */ - wrqu.data.length = 1; - wrqu.data.flags = 0; - /* Copy address */ - memcpy(threshold.addr.sa_data, address, ETH_ALEN); - threshold.addr.sa_family = ARPHRD_ETHER; - /* Copy stats */ - threshold.qual = *wstats; - /* Copy also thresholds */ - threshold.low = spydata->spy_thr_low; - threshold.high = spydata->spy_thr_high; - - /* Send event to user space */ - wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); -} - -/* ---------------------------------------------------------------- */ -/* - * Call for the driver to update the spy data. - * For now, the spy data is a simple array. As the size of the array is - * small, this is good enough. If we wanted to support larger number of - * spy addresses, we should use something more efficient... - */ -void wireless_spy_update(struct net_device * dev, - unsigned char * address, - struct iw_quality * wstats) -{ - struct iw_spy_data * spydata = get_spydata(dev); - int i; - int match = -1; - - /* Make sure driver is not buggy or using the old API */ - if (!spydata) - return; - - /* Update all records that match */ - for (i = 0; i < spydata->spy_number; i++) - if (ether_addr_equal(address, spydata->spy_address[i])) { - memcpy(&(spydata->spy_stat[i]), wstats, - sizeof(struct iw_quality)); - match = i; - } - - /* Generate an event if we cross the spy threshold. - * To avoid event storms, we have a simple hysteresis : we generate - * event only when we go under the low threshold or above the - * high threshold. */ - if (match >= 0) { - if (spydata->spy_thr_under[match]) { - if (wstats->level > spydata->spy_thr_high.level) { - spydata->spy_thr_under[match] = 0; - iw_send_thrspy_event(dev, spydata, - address, wstats); - } - } else { - if (wstats->level < spydata->spy_thr_low.level) { - spydata->spy_thr_under[match] = 1; - iw_send_thrspy_event(dev, spydata, - address, wstats); - } - } - } -} -EXPORT_SYMBOL(wireless_spy_update); -- cgit v1.3 From 49e3307da0f1dbad776650cf522f984b633d3afe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:14:30 +0200 Subject: wifi: cfg80211: stop exporting wext symbols CFG80211_WEXT_EXPORT is no longer needed, if we only make ipw2200 return the static name for SIOCGIWNAME itself. Link: https://patch.msgid.link/20241007211431.8d4a7242ce92.I66ceb885ddfa52c368feeea1ea884bf988c525f2@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/ipw2x00/Kconfig | 1 - drivers/net/wireless/intel/ipw2x00/ipw2200.c | 10 +++++++++- net/wireless/Kconfig | 9 +-------- net/wireless/scan.c | 2 -- net/wireless/wext-compat.c | 9 --------- net/wireless/wext-compat.h | 6 ------ 6 files changed, 10 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/ipw2x00/Kconfig b/drivers/net/wireless/intel/ipw2x00/Kconfig index ce34118f1e90..5e98be664d38 100644 --- a/drivers/net/wireless/intel/ipw2x00/Kconfig +++ b/drivers/net/wireless/intel/ipw2x00/Kconfig @@ -65,7 +65,6 @@ config IPW2100_DEBUG config IPW2200 tristate "Intel PRO/Wireless 2200BG and 2915ABG Network Connection" depends on PCI && CFG80211 - select CFG80211_WEXT_EXPORT select WIRELESS_EXT select WEXT_PRIV select FW_LOADER diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index 0008b4615731..c0e9d2109e34 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -6463,6 +6463,14 @@ static int ipw_set_rsn_capa(struct ipw_priv *priv, * WE-18 support */ +static int ipw_wx_get_name(struct net_device *dev, + struct iw_request_info *info, + union iwreq_data *wrqu, char *extra) +{ + strcpy(wrqu->name, "IEEE 802.11"); + return 0; +} + /* SIOCSIWGENIE */ static int ipw_wx_set_genie(struct net_device *dev, struct iw_request_info *info, @@ -9826,7 +9834,7 @@ static int ipw_wx_sw_reset(struct net_device *dev, /* Rebase the WE IOCTLs to zero for the handler array */ static iw_handler ipw_wx_handlers[] = { - IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), + IW_HANDLER(SIOCGIWNAME, ipw_wx_get_name), IW_HANDLER(SIOCSIWFREQ, ipw_wx_set_freq), IW_HANDLER(SIOCGIWFREQ, ipw_wx_get_freq), IW_HANDLER(SIOCSIWMODE, ipw_wx_set_mode), diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 8c8bd8b75708..2d67b5f2010e 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -185,19 +185,12 @@ config CFG80211_CRDA_SUPPORT If unsure, say Y. config CFG80211_WEXT - bool "cfg80211 wireless extensions compatibility" if !CFG80211_WEXT_EXPORT + bool "cfg80211 wireless extensions compatibility" select WEXT_CORE - default y if CFG80211_WEXT_EXPORT help Enable this option if you need old userspace for wireless extensions with cfg80211-based drivers. -config CFG80211_WEXT_EXPORT - bool - help - Drivers should select this option if they require cfg80211's - wext compatibility symbols to be exported. - config CFG80211_KUNIT_TEST tristate "KUnit tests for cfg80211" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 59a90bf3c0d6..8ba618f4734f 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3594,7 +3594,6 @@ int cfg80211_wext_siwscan(struct net_device *dev, kfree(creq); return err; } -EXPORT_WEXT_HANDLER(cfg80211_wext_siwscan); static char *ieee80211_scan_add_ies(struct iw_request_info *info, const struct cfg80211_bss_ies *ies, @@ -3966,5 +3965,4 @@ int cfg80211_wext_giwscan(struct net_device *dev, return res; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwscan); #endif diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index cd9f8f6e298b..0c8d3797a02e 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -30,7 +30,6 @@ int cfg80211_wext_giwname(struct net_device *dev, strcpy(wrqu->name, "IEEE 802.11"); return 0; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwname); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) @@ -69,7 +68,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, return ret; } -EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) @@ -105,7 +103,6 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, } return 0; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode); int cfg80211_wext_giwrange(struct net_device *dev, @@ -220,7 +217,6 @@ int cfg80211_wext_giwrange(struct net_device *dev, return 0; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwrange); /** @@ -281,7 +277,6 @@ out: wiphy_unlock(&rdev->wiphy); return err; } -EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, @@ -296,7 +291,6 @@ int cfg80211_wext_giwrts(struct net_device *dev, return 0; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, @@ -327,7 +321,6 @@ out: return err; } -EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, @@ -342,7 +335,6 @@ int cfg80211_wext_giwfrag(struct net_device *dev, return 0; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag); static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, @@ -413,7 +405,6 @@ int cfg80211_wext_giwretry(struct net_device *dev, return 0; } -EXPORT_WEXT_HANDLER(cfg80211_wext_giwretry); static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, struct net_device *dev, bool pairwise, diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index c02eb789e676..8251ca5df8ae 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -5,12 +5,6 @@ #include #include -#ifdef CONFIG_CFG80211_WEXT_EXPORT -#define EXPORT_WEXT_HANDLER(h) EXPORT_SYMBOL_GPL(h) -#else -#define EXPORT_WEXT_HANDLER(h) -#endif /* CONFIG_CFG80211_WEXT_EXPORT */ - int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra); -- cgit v1.3 From aee809aaa2d13bf560fe38d28c4969605e6d9d0e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 21:47:16 +0200 Subject: wifi: cfg80211: unexport wireless_nlevent_flush() This no longer needs to be exported, so don't export it. Link: https://patch.msgid.link/20241007214715.3dd736dc3ac0.I1388536e99c37f28a007dd753c473ad21513d9a9@changeid Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ------ net/wireless/wext-compat.h | 6 ++++++ net/wireless/wext-core.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index fc44fcca1d5c..804587b7592b 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -413,12 +413,6 @@ struct iw_spy_data { /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); -#ifdef CONFIG_WEXT_CORE -/* flush all previous wext events - if work is done from netdev notifiers */ -void wireless_nlevent_flush(void); -#else -static inline void wireless_nlevent_flush(void) {} -#endif /* We may need a function to send a stream of events to user space. * More on that later... */ diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index 8251ca5df8ae..f680dd134582 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -5,6 +5,12 @@ #include #include +#ifdef CONFIG_WEXT_CORE +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif + int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra); diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 3bb04b05c5ce..00c640b3e86e 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -20,6 +20,7 @@ #include #include #include +#include "wext-compat.h" typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, unsigned int, struct iw_request_info *, @@ -356,7 +357,6 @@ void wireless_nlevent_flush(void) } up_read(&net_rwsem); } -EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) -- cgit v1.3 From da5e06dee58ad153a4933fd40fc53d571bfef373 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sun, 6 Oct 2024 07:26:09 +0900 Subject: net-timestamp: namespacify the sysctl_tstamp_allow_data Let it be tuned in per netns by admins. Signed-off-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241005222609.94980-1-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/core.h | 1 + include/net/sock.h | 2 -- net/core/net_namespace.c | 1 + net/core/skbuff.c | 2 +- net/core/sock.c | 2 -- net/core/sysctl_net_core.c | 18 +++++++++--------- 6 files changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 78214f1b43a2..9b36f0ff0c20 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -15,6 +15,7 @@ struct netns_core { int sysctl_somaxconn; int sysctl_optmem_max; u8 sysctl_txrehash; + u8 sysctl_tstamp_allow_data; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index e282127092ab..b32f1424ecc5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2824,8 +2824,6 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; -extern int sysctl_tstamp_allow_data; - extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 105e3cd26763..a5bc1fd8b034 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -317,6 +317,7 @@ static __net_init void preinit_net_sysctl(struct net *net) */ net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; + net->core.sysctl_tstamp_allow_data = 1; } /* init code that must occur even if setup_net() is not called. */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74149dc4ee31..00afeb90c23a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5506,7 +5506,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) + if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data))) return true; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/core/sock.c b/net/core/sock.c index 846f494a17cf..083d438d8b6f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -286,8 +286,6 @@ EXPORT_SYMBOL(sysctl_rmem_max); __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; -int sysctl_tstamp_allow_data __read_mostly = 1; - DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 86a2476678c4..b60fac380cec 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -491,15 +491,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -665,6 +656,15 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ -- cgit v1.3 From 3fe3dbaf26723c473d42a58b636a2500586b821e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 7 Oct 2024 01:44:56 +0100 Subject: caif: Remove unused cfsrvl_getphyid cfsrvl_getphyid() has been unused since 2011's commit f36214408470 ("caif: Use RCU and lists in cfcnfg.c for managing caif link layers") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241007004456.149899-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/caif/cfsrvl.h | 1 - net/caif/cfsrvl.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'net') diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h index 5ee7b322e18b..a000dc45f966 100644 --- a/include/net/caif/cfsrvl.h +++ b/include/net/caif/cfsrvl.h @@ -40,7 +40,6 @@ void cfsrvl_init(struct cfsrvl *service, struct dev_info *dev_info, bool supports_flowctrl); bool cfsrvl_ready(struct cfsrvl *service, int *err); -u8 cfsrvl_getphyid(struct cflayer *layer); static inline void cfsrvl_get(struct cflayer *layr) { diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index 9cef9496a707..171fa32ada85 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -183,12 +183,6 @@ bool cfsrvl_ready(struct cfsrvl *service, int *err) return true; } -u8 cfsrvl_getphyid(struct cflayer *layer) -{ - struct cfsrvl *servl = container_obj(layer); - return servl->dev_info.id; -} - bool cfsrvl_phyid_match(struct cflayer *layer, int phyid) { struct cfsrvl *servl = container_obj(layer); -- cgit v1.3 From db03488897a70367aeafe82d07a78943d2a6068e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 9 Oct 2024 08:33:05 +0200 Subject: Revert "wifi: cfg80211: unexport wireless_nlevent_flush()" Revert this, I neglected to take into account the fact that cfg80211 itself can be a module, but wext is always builtin. Fixes: aee809aaa2d1 ("wifi: cfg80211: unexport wireless_nlevent_flush()") Signed-off-by: Johannes Berg --- include/net/iw_handler.h | 6 ++++++ net/wireless/wext-compat.h | 6 ------ net/wireless/wext-core.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index c9b46b996197..b80e474cb0aa 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -410,6 +410,12 @@ struct iw_spy_data { /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); +#ifdef CONFIG_WEXT_CORE +/* flush all previous wext events - if work is done from netdev notifiers */ +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif /* We may need a function to send a stream of events to user space. * More on that later... */ diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index f680dd134582..8251ca5df8ae 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -5,12 +5,6 @@ #include #include -#ifdef CONFIG_WEXT_CORE -void wireless_nlevent_flush(void); -#else -static inline void wireless_nlevent_flush(void) {} -#endif - int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra); diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 00c640b3e86e..3bb04b05c5ce 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -20,7 +20,6 @@ #include #include #include -#include "wext-compat.h" typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, unsigned int, struct iw_request_info *, @@ -357,6 +356,7 @@ void wireless_nlevent_flush(void) } up_read(&net_rwsem); } +EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) -- cgit v1.3 From 69a3272d787c3e5977927a2775ecbd1a847dcf11 Mon Sep 17 00:00:00 2001 From: Jonas Rebmann Date: Mon, 7 Oct 2024 16:17:11 +0200 Subject: net: ipv4: igmp: optimize ____ip_mc_inc_group() using mc_hash The runtime cost of joining a single multicast group in the current implementation of ____ip_mc_inc_group grows linearly with the number of existing memberships. This is caused by the linear search for an existing group record in the multicast address list. This linear complexity results in quadratic complexity when successively adding memberships, which becomes a performance bottleneck when setting up large numbers of multicast memberships. If available, use the existing multicast hash map mc_hash to quickly search for an existing group membership record. This leads to near-constant complexity on the addition of a new multicast record, significantly improving performance for workloads involving many multicast memberships. On profiling with a loopback device, this patch presented a speedup of around 6 when successively setting up 2000 multicast groups using setsockopt without measurable drawbacks on smaller numbers of multicast groups. Signed-off-by: Jonas Rebmann Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9bf09de6a2e7..6a238398acc9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1437,16 +1437,32 @@ static void ip_mc_hash_remove(struct in_device *in_dev, static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode, gfp_t gfp) { + struct ip_mc_list __rcu **mc_hash; struct ip_mc_list *im; ASSERT_RTNL(); - for_each_pmc_rtnl(in_dev, im) { - if (im->multiaddr == addr) { - im->users++; - ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); - goto out; + mc_hash = rtnl_dereference(in_dev->mc_hash); + if (mc_hash) { + u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG); + + for (im = rtnl_dereference(mc_hash[hash]); + im; + im = rtnl_dereference(im->next_hash)) { + if (im->multiaddr == addr) + break; } + } else { + for_each_pmc_rtnl(in_dev, im) { + if (im->multiaddr == addr) + break; + } + } + + if (im) { + im->users++; + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); + goto out; } im = kzalloc(sizeof(*im), gfp); -- cgit v1.3 From 2b78d30620d7f8a9f9ce312ad21200ec7a554bd9 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:29 +0200 Subject: ipv4: Convert ip_route_use_hint() to dscp_t. Pass a dscp_t variable to ip_route_use_hint(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_rcv_finish_core() actually calls ip_route_use_hint(). Use the ip4h_dscp() helper to get the DSCP from the IPv4 header. While there, modify the declaration of ip_route_use_hint() in include/net/route.h so that it matches the prototype of its implementation in net/ipv4/route.c. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c40994fdf804db7a363d04fdee01bf48dddda676.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 4 ++-- net/ipv4/ip_input.c | 4 ++-- net/ipv4/route.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 5e4374d66927..c219c0fecdcf 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin, +int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c0a2490eb7c1..89bb63da6852 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -325,8 +325,8 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (ip_can_use_hint(skb, iph, hint)) { - err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, - dev, hint); + err = ip_route_use_hint(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev, hint); if (unlikely(err)) goto drop_error; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6e1cd0065b87..ac03916cfcde 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2136,7 +2136,7 @@ static int ip_mkroute_input(struct sk_buff *skb, * Uses the provided hint instead of performing a route lookup. */ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2160,8 +2160,8 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - tos &= INET_DSCP_MASK; - err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); + err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), + 0, dev, in_dev, &tag); if (err < 0) goto martian_source; -- cgit v1.3 From 34f28ffd62c14d8b558b3ef0de6c0ebfc5ca0b1a Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:35 +0200 Subject: ipv4: Convert ip_mkroute_input() to dscp_t. Pass a dscp_t variable to ip_mkroute_input(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_route_input_slow() actually calls ip_mkroute_input(). Since it already has a dscp_t variable to pass as parameter, we only need to remove the inet_dscp_to_dsfield() conversion. While there, reorganise the function parameters to fill up horizontal space. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/6aa71e28f9ff681cbd70847080e1ab6b526f94f1.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ac03916cfcde..38bb38dbe490 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2112,11 +2112,9 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int ip_mkroute_input(struct sk_buff *skb, - struct fib_result *res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct flow_keys *hkeys) +static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { @@ -2128,7 +2126,8 @@ static int ip_mkroute_input(struct sk_buff *skb, #endif /* create a routing cache entry */ - return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); + return __mkroute_input(skb, res, in_dev, daddr, saddr, + inet_dscp_to_dsfield(dscp)); } /* Implements all the saddr-related checks as ip_route_input_slow(), @@ -2315,8 +2314,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_destination; make_route: - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, - inet_dscp_to_dsfield(dscp), flkeys); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); out: return err; brd_input: -- cgit v1.3 From 0936c671911f46fcc0cc0c8ad2925eade7f64e80 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:42 +0200 Subject: ipv4: Convert __mkroute_input() to dscp_t. Pass a dscp_t variable to __mkroute_input(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_mkroute_input() actually calls __mkroute_input(). Since it already has a dscp_t variable to pass as parameter, we only need to remove the inet_dscp_to_dsfield() conversion. While there, reorganise the function parameters to fill up horizontal space. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/40853c720aee4d608e6b1b204982164c3b76697d.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 38bb38dbe490..763b8bafd1bf 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1764,10 +1764,9 @@ static void ip_handle_martian_source(struct net_device *dev, } /* called in rcu_read_lock() section */ -static int __mkroute_input(struct sk_buff *skb, - const struct fib_result *res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) +static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; @@ -1785,8 +1784,8 @@ static int __mkroute_input(struct sk_buff *skb, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, in_dev, &itag); + err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), + FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2126,8 +2125,7 @@ static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, #endif /* create a routing cache entry */ - return __mkroute_input(skb, res, in_dev, daddr, saddr, - inet_dscp_to_dsfield(dscp)); + return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* Implements all the saddr-related checks as ip_route_input_slow(), -- cgit v1.3 From 1a7c292617e4c6dcacf0590909ad9a231df6e25e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:48 +0200 Subject: ipv4: Convert ip_route_input_mc() to dscp_t. Pass a dscp_t variable to ip_route_input_mc(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only ip_route_input_rcu() actually calls ip_route_input_mc(). Since it already has a dscp_t variable to pass as parameter, we only need to remove the inet_dscp_to_dsfield() conversion. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/0cc653ef59bbc0a28881f706d34896c61eba9e01.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 763b8bafd1bf..527121be1ba2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1697,7 +1697,7 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) + dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; @@ -1705,7 +1705,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 itag = 0; int err; - err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + err = ip_mc_validate_source(skb, daddr, saddr, + inet_dscp_to_dsfield(dscp), dev, in_dev, + &itag); if (err) return err; @@ -2455,9 +2457,8 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - err = ip_route_input_mc(skb, daddr, saddr, - inet_dscp_to_dsfield(dscp), - dev, our); + err = ip_route_input_mc(skb, daddr, saddr, dscp, dev, + our); } return err; } -- cgit v1.3 From d32976408744a589f04b5c939f8f01f7167e5167 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:24:54 +0200 Subject: ipv4: Convert ip_mc_validate_source() to dscp_t. Pass a dscp_t variable to ip_mc_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Callers of ip_mc_validate_source() to consider are: * ip_route_input_mc() which already has a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversion. * udp_v4_early_demux() which gets the DSCP directly from the IPv4 header and can simply use the ip4h_dscp() helper. Also, stop including net/inet_dscp.h in udp.c as we don't use any of its declarations anymore. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/c91b2cca04718b7ee6cf5b9c1d5b40507d65a8d4.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 3 ++- net/ipv4/route.c | 8 ++++---- net/ipv4/udp.c | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index c219c0fecdcf..586e59f7ed8a 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -198,8 +198,9 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 fl4->fl4_gre_key = gre_key; return ip_route_output_key(net, fl4); } + int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 527121be1ba2..1efb65e647c1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1666,7 +1666,7 @@ EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, + dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { int err; @@ -1687,7 +1687,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, + err = fib_validate_source(skb, saddr, 0, + inet_dscp_to_dsfield(dscp), 0, dev, in_dev, itag); if (err < 0) return err; @@ -1705,8 +1706,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 itag = 0; int err; - err = ip_mc_validate_source(skb, daddr, saddr, - inet_dscp_to_dsfield(dscp), dev, in_dev, + err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (err) return err; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8accbf4cb295..4b74a25d0b6e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -115,7 +116,6 @@ #include #include #include -#include #if IS_ENABLED(CONFIG_IPV6) #include #endif @@ -2619,7 +2619,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, - iph->tos & INET_DSCP_MASK, + ip4h_dscp(iph), skb->dev, in_dev, &itag); } return 0; -- cgit v1.3 From d36236ab52754ef6bd083be945e9c2e93f466022 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:25:02 +0200 Subject: ipv4: Convert fib_validate_source() to dscp_t. Pass a dscp_t variable to fib_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. All callers of fib_validate_source() already have a dscp_t variable to pass as parameter. We just need to remove the inet_dscp_to_dsfield() conversions. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/08612a4519bc5a3578bb493fbaad82437ebb73dc.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_frontend.c | 5 +++-- net/ipv4/route.c | 21 +++++++++------------ 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 967e4dc555fa..06130933542d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -449,8 +449,9 @@ int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 793e6781399a..d0fbc8c8c5e6 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -419,7 +419,7 @@ e_rpf: /* Ignore rp_filter for packets protected by IPsec. */ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag) { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); @@ -448,7 +448,8 @@ ok: } full_check: - return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); + return __fib_validate_source(skb, src, dst, inet_dscp_to_dsfield(dscp), + oif, dev, r, idev, itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1efb65e647c1..a0b091a7df87 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1687,9 +1687,8 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, - inet_dscp_to_dsfield(dscp), 0, dev, - in_dev, itag); + err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, + itag); if (err < 0) return err; } @@ -1786,8 +1785,8 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), - FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); + err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), + in_dev->dev, in_dev, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2159,8 +2158,8 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - err = fib_validate_source(skb, saddr, daddr, inet_dscp_to_dsfield(dscp), - 0, dev, in_dev, &tag); + err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, + &tag); if (err < 0) goto martian_source; @@ -2298,8 +2297,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, } if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, - inet_dscp_to_dsfield(dscp), 0, dev, + err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, &itag); if (err < 0) goto martian_source; @@ -2322,9 +2320,8 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, - inet_dscp_to_dsfield(dscp), 0, dev, - in_dev, &itag); + err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, + &itag); if (err < 0) goto martian_source; } -- cgit v1.3 From 3768b402735ea3a580e46d8e6c94779e2f42fb4c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Oct 2024 20:25:08 +0200 Subject: ipv4: Convert __fib_validate_source() to dscp_t. Pass a dscp_t variable to __fib_validate_source(), instead of a plain u8, to prevent accidental setting of ECN bits in ->flowi4_tos. Only fib_validate_source() actually calls __fib_validate_source(). Since it already has a dscp_t variable to pass as parameter, we only need to remove the inet_dscp_to_dsfield() conversion. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: David Ahern Link: https://patch.msgid.link/8206b0a64a21a208ed94774e261a251c8d7bc251.1728302212.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d0fbc8c8c5e6..8353518b110a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev); * called with rcu_read_lock() */ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, - u8 tos, int oif, struct net_device *dev, + dscp_t dscp, int oif, struct net_device *dev, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); @@ -357,7 +357,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; - fl4.flowi4_tos = tos; + fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; @@ -448,8 +448,8 @@ ok: } full_check: - return __fib_validate_source(skb, src, dst, inet_dscp_to_dsfield(dscp), - oif, dev, r, idev, itag); + return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev, + itag); } static inline __be32 sk_extract_addr(struct sockaddr *addr) -- cgit v1.3 From d35bd24cea949235d10226576dcf476cf7fdc41d Mon Sep 17 00:00:00 2001 From: "xin.guo" Date: Mon, 7 Oct 2024 16:25:44 +0800 Subject: tcp: remove unnecessary update for tp->write_seq in tcp_connect() Commit 783237e8daf13 ("net-tcp: Fast Open client - sending SYN-data") introduces tcp_connect_queue_skb() and it would overwrite tcp->write_seq, so it is no need to update tp->write_seq before invoking tcp_connect_queue_skb(). Signed-off-by: xin.guo Link: https://patch.msgid.link/1728289544-4611-1-git-send-email-guoxin0309@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 08772395690d..1251510f0e58 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4134,7 +4134,10 @@ int tcp_connect(struct sock *sk) if (unlikely(!buff)) return -ENOBUFS; - tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); + /* SYN eats a sequence byte, write_seq updated by + * tcp_connect_queue_skb(). + */ + tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); -- cgit v1.3 From 4a0ec2aa0704c8271fde33a0f7bb92d09c066c17 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2024 12:01:01 +0000 Subject: ipv6: switch inet6_addr_hash() to less predictable hash In commit 3f27fb23219e ("ipv6: addrconf: add per netns perturbation in inet6_addr_hash()"), I added net_hash_mix() in inet6_addr_hash() to get better hash dispersion, at a time all netns were sharing the hash table. Since then, commit 21a216a8fc63 ("ipv6/addrconf: allocate a per netns hash table") made the hash table per netns. We could remove the net_hash_mix() from inet6_addr_hash(), but there is still an issue with ipv6_addr_hash(). It is highly predictable and a malicious user can easily create thousands of IPv6 addresses all stored in the same hash bucket. Switch to __ipv6_addr_jhash(). We could use a dedicated secret, or reuse net_hash_mix() as I did in this patch. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008120101.734521-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 94dceac52884..f31528d4f694 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1016,7 +1016,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr) { - u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } -- cgit v1.3 From 4daf4dc275f1aa3b9629334c4185d62b7bdff1c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2024 12:13:07 +0000 Subject: ipv6: switch inet6_acaddr_hash() to less predictable hash commit 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable") added inet6_acaddr_hash(), using ipv6_addr_hash() and net_hash_mix() to get hash spreading for typical users. However ipv6_addr_hash() is highly predictable and a malicious user could abuse a specific hash bucket. Switch to __ipv6_addr_jhash(). We could use a dedicated secret, or reuse net_hash_mix() as I did in this patch. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008121307.800040-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/anycast.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 0627c4c18d1a..562cace50ca9 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -49,9 +49,10 @@ static DEFINE_SPINLOCK(acaddr_hash_lock); static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr); -static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr) +static u32 inet6_acaddr_hash(const struct net *net, + const struct in6_addr *addr) { - u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net); + u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net)); return hash_32(val, IN6_ADDR_HSIZE_SHIFT); } -- cgit v1.3 From 3a1beabe115910d848c959268ae6d68b4da77fd7 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 8 Oct 2024 10:54:54 +0200 Subject: ipv6: Remove redundant unlikely() IS_ERR_OR_NULL() already implies unlikely(). Signed-off-by: Tobias Klauser Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241008085454.8087-1-tklauser@distanz.ch Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 205673179b3c..f7b4608bb316 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -127,7 +127,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); - if (unlikely(IS_ERR_OR_NULL(neigh))) { + if (IS_ERR_OR_NULL(neigh)) { if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { -- cgit v1.3 From e4650d7ae4252f67e997a632adfae0dd74d3a99a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2024 11:16:03 +0000 Subject: net_sched: sch_sfq: handle bigger packets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SFQ has an assumption on dealing with packets smaller than 64KB. Even before BIG TCP, TCA_STAB can provide arbitrary big values in qdisc_pkt_len(skb) It is time to switch (struct sfq_slot)->allot to a 32bit field. sizeof(struct sfq_slot) is now 64 bytes, giving better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241008111603.653140-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_sfq.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 3b9245a3c767..a4b8296a2fa1 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -77,12 +77,6 @@ #define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 -/* We use 16 bits to store allot, and want to handle packets up to 64K - * Scale allot by 8 (1<<3) so that no overflow occurs. - */ -#define SFQ_ALLOT_SHIFT 3 -#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) - /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ typedef u16 sfq_index; @@ -104,7 +98,7 @@ struct sfq_slot { sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ - short allot; /* credit for this slot */ + int allot; /* credit for this slot */ unsigned int backlog; struct red_vars vars; @@ -120,7 +114,6 @@ struct sfq_sched_data { siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; - unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct tcf_proto __rcu *filter_list; struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ @@ -456,7 +449,7 @@ enqueue: */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ - slot->allot = q->scaled_quantum; + slot->allot = q->quantum; } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; @@ -493,7 +486,7 @@ next_slot: slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; - slot->allot += q->scaled_quantum; + slot->allot += q->quantum; goto next_slot; } skb = slot_dequeue_head(slot); @@ -512,7 +505,7 @@ next_slot: } q->tail->next = next_a; } else { - slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); + slot->allot -= qdisc_pkt_len(skb); } return skb; } @@ -595,7 +588,7 @@ drop: q->tail->next = x; } q->tail = slot; - slot->allot = q->scaled_quantum; + slot->allot = q->quantum; } } sch->q.qlen -= dropped; @@ -628,7 +621,8 @@ static void sfq_perturbation(struct timer_list *t) rcu_read_unlock(); } -static int sfq_change(struct Qdisc *sch, struct nlattr *opt) +static int sfq_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); @@ -646,14 +640,10 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; - /* slot->allot is a short, make sure quantum is not too big. */ - if (ctl->quantum) { - unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); - - if (scaled <= 0 || scaled > SHRT_MAX) - return -EINVAL; + if ((int)ctl->quantum < 0) { + NL_SET_ERR_MSG_MOD(extack, "invalid quantum"); + return -EINVAL; } - if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; @@ -663,10 +653,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) return -ENOMEM; } sch_tree_lock(sch); - if (ctl->quantum) { + if (ctl->quantum) q->quantum = ctl->quantum; - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); - } WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); if (ctl->flows) q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); @@ -762,12 +750,11 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt, q->divisor = SFQ_DEFAULT_HASH_DIVISOR; q->maxflows = SFQ_DEFAULT_FLOWS; q->quantum = psched_mtu(qdisc_dev(sch)); - q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { - int err = sfq_change(sch, opt); + int err = sfq_change(sch, opt, extack); if (err) return err; } @@ -878,7 +865,7 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; - xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; + xstats.allot = slot->allot; qs.qlen = slot->qlen; qs.backlog = slot->backlog; } -- cgit v1.3 From 87173021f1583ee37f4801fcde354729da8db3dc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:03 -0700 Subject: ipv4: Link IPv4 address to per-netns hash table. As a prep for per-netns RTNL conversion, we want to namespacify the IPv4 address hash table and the GC work. Let's allocate the per-netns IPv4 address hash table to net->ipv4.inet_addr_lst and link IPv4 addresses into it. The actual users will be converted later. Note that the IPv6 address hash table is already namespacified. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 + include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 22 +++++++++++++++++++--- 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index cb5280e6cc21..d0c2bf67a9b0 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -142,6 +142,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) struct in_ifaddr { struct hlist_node hash; + struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 276f622f3516..29eba2eaaa26 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -270,5 +270,6 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; + struct hlist_head *inet_addr_lst; }; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ab76744383cf..059807a627a6 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -134,11 +134,13 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) ASSERT_RTNL(); hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); + hlist_del_init_rcu(&ifa->addr_lst); hlist_del_init_rcu(&ifa->hash); } @@ -228,6 +230,7 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->hash); + INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } @@ -2663,14 +2666,21 @@ static struct ctl_table ctl_forward_entry[] = { static __net_init int devinet_init_net(struct net *net) { - int err; - struct ipv4_devconf *all, *dflt; #ifdef CONFIG_SYSCTL - struct ctl_table *tbl; struct ctl_table_header *forw_hdr; + struct ctl_table *tbl; #endif + struct ipv4_devconf *all, *dflt; + int err; + int i; err = -ENOMEM; + net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!net->ipv4.inet_addr_lst) + goto err_alloc_hash; + all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; @@ -2731,6 +2741,9 @@ static __net_init int devinet_init_net(struct net *net) net->ipv4.forw_hdr = forw_hdr; #endif + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); + net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; @@ -2748,6 +2761,8 @@ err_alloc_ctl: err_alloc_dflt: kfree(all); err_alloc_all: + kfree(net->ipv4.inet_addr_lst); +err_alloc_hash: return err; } @@ -2766,6 +2781,7 @@ static __net_exit void devinet_exit_net(struct net *net) #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); + kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { -- cgit v1.3 From 49e613194292ff7750a3f889cd2db012da16f68e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:04 -0700 Subject: ipv4: Use per-netns hash table in inet_lookup_ifaddr_rcu(). Now, all IPv4 addresses are put in the per-netns hash table. Let's use it in inet_lookup_ifaddr_rcu(). Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 059807a627a6..cf47b5ac061f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -188,9 +188,8 @@ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) - if (ifa->ifa_local == addr && - net_eq(dev_net(ifa->ifa_dev->dev), net)) + hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) + if (ifa->ifa_local == addr) return ifa; return NULL; -- cgit v1.3 From 1675f385213edc14ed849e079d6866b48e552252 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:05 -0700 Subject: ipv4: Namespacify IPv4 address GC. Each IPv4 address could have a lifetime, which is useful for DHCP, and GC is periodically executed as check_lifetime_work. check_lifetime() does the actual GC under RTNL. 1. Acquire RTNL 2. Iterate inet_addr_lst 3. Remove IPv4 address if expired 4. Release RTNL Namespacifying the GC is required for per-netns RTNL, but using the per-netns hash table will shorten the time on the hash bucket iteration under RTNL. Let's add per-netns GC work and use the per-netns hash table. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + net/ipv4/devinet.c | 32 ++++++++++++++++++-------------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 29eba2eaaa26..66a4cffc44ee 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -271,5 +271,6 @@ struct netns_ipv4 { atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; + struct delayed_work addr_chk_work; }; #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cf47b5ac061f..ac245944e89e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -486,15 +486,12 @@ static void inet_del_ifa(struct in_device *in_dev, __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } -static void check_lifetime(struct work_struct *work); - -static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); - static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; + struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; @@ -563,8 +560,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, inet_hash_insert(dev_net(in_dev->dev), ifa); - cancel_delayed_work(&check_lifetime_work); - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); + cancel_delayed_work(&net->ipv4.addr_chk_work); + queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that @@ -710,16 +707,19 @@ static void check_lifetime(struct work_struct *work) unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; + struct net *net; int i; + net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { + struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { + hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; @@ -757,7 +757,7 @@ static void check_lifetime(struct work_struct *work) if (!change_needed) continue; rtnl_lock(); - hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { + hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) @@ -806,8 +806,8 @@ static void check_lifetime(struct work_struct *work) if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, - next_sched - now); + queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, + next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, @@ -1004,9 +1004,9 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); - cancel_delayed_work(&check_lifetime_work); + cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, - &check_lifetime_work, 0); + &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } return 0; @@ -2743,6 +2743,8 @@ static __net_init int devinet_init_net(struct net *net) for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); + INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); + net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; @@ -2769,7 +2771,11 @@ static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; +#endif + + cancel_delayed_work_sync(&net->ipv4.addr_chk_work); +#ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, @@ -2806,8 +2812,6 @@ void __init devinet_init(void) register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); - rtnl_af_register(&inet_af_ops); rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); -- cgit v1.3 From 99ee348e6a41cf24b334a1bb7cde87239e8e2d95 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 8 Oct 2024 10:29:06 -0700 Subject: ipv4: Retire global IPv4 hash table inet_addr_lst. No one uses inet_addr_lst anymore, so let's remove it. Reviewed-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241008172906.1326-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 - net/ipv4/devinet.c | 10 ---------- 2 files changed, 11 deletions(-) (limited to 'net') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d0c2bf67a9b0..d9c690c8c80b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -141,7 +141,6 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) ARP_EVICT_NOCARRIER) struct in_ifaddr { - struct hlist_node hash; struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ac245944e89e..7c156f85b7d2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -119,8 +119,6 @@ struct inet_fill_args { #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) -static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; - static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = (__force u32) addr ^ net_hash_mix(net); @@ -133,7 +131,6 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); - hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } @@ -141,7 +138,6 @@ static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); - hlist_del_init_rcu(&ifa->hash); } /** @@ -228,7 +224,6 @@ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) in_dev_hold(in_dev); ifa->ifa_dev = in_dev; - INIT_HLIST_NODE(&ifa->hash); INIT_HLIST_NODE(&ifa->addr_lst); return ifa; @@ -2804,11 +2799,6 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = { void __init devinet_init(void) { - int i; - - for (i = 0; i < IN4_ADDR_HSIZE; i++) - INIT_HLIST_HEAD(&inet_addr_lst[i]); - register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); -- cgit v1.3 From 13d68a16430312fc21990f48326366eb73891202 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:47 +0200 Subject: genetlink: extend info user-storage to match NL cb ctx This allows a more uniform implementation of non-dump and dump operations, and will be used later in the series to avoid some per-operation allocation. Additionally rename the NL_ASSERT_DUMP_CTX_FITS macro, to fit a more extended usage. Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/1130cc2896626b84587a2a5f96a5c6829638f4da.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_mdb.c | 2 +- include/linux/netlink.h | 5 +++-- include/net/genetlink.h | 8 ++++++-- net/core/netdev-genl.c | 2 +- net/core/rtnetlink.c | 2 +- net/devlink/devl_internal.h | 2 +- net/ethtool/rss.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netlink/genetlink.c | 4 ++-- 9 files changed, 17 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan/vxlan_mdb.c b/drivers/net/vxlan/vxlan_mdb.c index 60eb95a06d55..ebed05a2804c 100644 --- a/drivers/net/vxlan/vxlan_mdb.c +++ b/drivers/net/vxlan/vxlan_mdb.c @@ -284,7 +284,7 @@ int vxlan_mdb_dump(struct net_device *dev, struct sk_buff *skb, ASSERT_RTNL(); - NL_ASSERT_DUMP_CTX_FITS(struct vxlan_mdb_dump_ctx); + NL_ASSERT_CTX_FITS(struct vxlan_mdb_dump_ctx); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWMDB, sizeof(*bpm), diff --git a/include/linux/netlink.h b/include/linux/netlink.h index b332c2048c75..a3ca198a3a9e 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -34,6 +34,7 @@ struct netlink_skb_parms { #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) #define NETLINK_CREDS(skb) (&NETLINK_CB((skb)).creds) +#define NETLINK_CTX_SIZE 48 void netlink_table_grab(void); @@ -293,7 +294,7 @@ struct netlink_callback { int flags; bool strict_check; union { - u8 ctx[48]; + u8 ctx[NETLINK_CTX_SIZE]; /* args is deprecated. Cast a struct over ctx instead * for proper type safety. @@ -302,7 +303,7 @@ struct netlink_callback { }; }; -#define NL_ASSERT_DUMP_CTX_FITS(type_name) \ +#define NL_ASSERT_CTX_FITS(type_name) \ BUILD_BUG_ON(sizeof(type_name) > \ sizeof_field(struct netlink_callback, ctx)) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9ab49bfeae78..9d3726e8f90e 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -124,7 +124,8 @@ struct genl_family { * @genlhdr: generic netlink message header * @attrs: netlink attributes * @_net: network namespace - * @user_ptr: user pointers + * @ctx: storage space for the use by the family + * @user_ptr: user pointers (deprecated, use ctx instead) * @extack: extended ACK report struct */ struct genl_info { @@ -135,7 +136,10 @@ struct genl_info { struct genlmsghdr * genlhdr; struct nlattr ** attrs; possible_net_t _net; - void * user_ptr[2]; + union { + u8 ctx[NETLINK_CTX_SIZE]; + void * user_ptr[2]; + }; struct netlink_ext_ack *extack; }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1cb954f2d39e..358cba248796 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -24,7 +24,7 @@ struct netdev_nl_dump_ctx { static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct netdev_nl_dump_ctx); + NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx); return (struct netdev_nl_dump_ctx *)cb->ctx; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6d68247aea70..a9b81b7d9746 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6243,7 +6243,7 @@ static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int idx, s_idx; int err; - NL_ASSERT_DUMP_CTX_FITS(struct rtnl_mdb_dump_ctx); + NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx); if (cb->strict_check) { err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack); diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index c7a8e13f917c..a9f064ab9ed9 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -166,7 +166,7 @@ int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, static inline struct devlink_nl_dump_state * devlink_dump_state(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct devlink_nl_dump_state); + NL_ASSERT_CTX_FITS(struct devlink_nl_dump_state); return (struct devlink_nl_dump_state *)cb->ctx; } diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index e07386275e14..7cb106b590ab 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -224,7 +224,7 @@ struct rss_nl_dump_ctx { static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb) { - NL_ASSERT_DUMP_CTX_FITS(struct rss_nl_dump_ctx); + NL_ASSERT_CTX_FITS(struct rss_nl_dump_ctx); return (struct rss_nl_dump_ctx *)cb->ctx; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6a1239433830..36168f8b6efa 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3870,7 +3870,7 @@ static int __init ctnetlink_init(void) { int ret; - NL_ASSERT_DUMP_CTX_FITS(struct ctnetlink_list_dump_ctx); + NL_ASSERT_CTX_FITS(struct ctnetlink_list_dump_ctx); ret = nfnetlink_subsys_register(&ctnl_subsys); if (ret < 0) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index feb54c63a116..29387b605f3e 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -997,7 +997,7 @@ static int genl_start(struct netlink_callback *cb) info->info.attrs = attrs; genl_info_net_set(&info->info, sock_net(cb->skb->sk)); info->info.extack = cb->extack; - memset(&info->info.user_ptr, 0, sizeof(info->info.user_ptr)); + memset(&info->info.ctx, 0, sizeof(info->info.ctx)); cb->data = info; if (ops->start) { @@ -1104,7 +1104,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, info.attrs = attrbuf; info.extack = extack; genl_info_net_set(&info, net); - memset(&info.user_ptr, 0, sizeof(info.user_ptr)); + memset(&info.ctx, 0, sizeof(info.ctx)); if (ops->pre_doit) { err = ops->pre_doit(ops, skb, &info); -- cgit v1.3 From 04e65df94b3112a1b319b6deb5bab83fd740bc7d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:48 +0200 Subject: netlink: spec: add shaper YAML spec Define the user-space visible interface to query, configure and delete network shapers via yaml definition. Add dummy implementations for the relevant NL callbacks. set() and delete() operations touch a single shaper creating/updating or deleting it. The group() operation creates a shaper's group, nesting multiple input shapers under the specified output shaper. Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/7a33a1ff370bdbcd0cd3f909575c912cd56f41da.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/net_shaper.yaml | 274 ++++++++++++++++++++++++++++ MAINTAINERS | 1 + include/uapi/linux/net_shaper.h | 78 ++++++++ net/Kconfig | 3 + net/Makefile | 1 + net/shaper/Makefile | 8 + net/shaper/shaper.c | 55 ++++++ net/shaper/shaper_nl_gen.c | 125 +++++++++++++ net/shaper/shaper_nl_gen.h | 34 ++++ 9 files changed, 579 insertions(+) create mode 100644 Documentation/netlink/specs/net_shaper.yaml create mode 100644 include/uapi/linux/net_shaper.h create mode 100644 net/shaper/Makefile create mode 100644 net/shaper/shaper.c create mode 100644 net/shaper/shaper_nl_gen.c create mode 100644 net/shaper/shaper_nl_gen.h (limited to 'net') diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml new file mode 100644 index 000000000000..618fc09932ff --- /dev/null +++ b/Documentation/netlink/specs/net_shaper.yaml @@ -0,0 +1,274 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +name: net-shaper + +doc: | + Networking HW rate limiting configuration. + + This API allows configuring HW shapers available on the network + devices at different levels (queues, network device) and allows + arbitrary manipulation of the scheduling tree of the involved + shapers. + + Each @shaper is identified within the given device, by a @handle, + comprising both a @scope and an @id. + + Depending on the @scope value, the shapers are attached to specific + HW objects (queues, devices) or, for @node scope, represent a + scheduling group, that can be placed in an arbitrary location of + the scheduling tree. + + Shapers can be created with two different operations: the @set + operation, to create and update a single "attached" shaper, and + the @group operation, to create and update a scheduling + group. Only the @group operation can create @node scope shapers. + + Existing shapers can be deleted/reset via the @delete operation. + + The user can query the running configuration via the @get operation. + +definitions: + - + type: enum + name: scope + doc: Defines the shaper @id interpretation. + render-max: true + entries: + - name: unspec + doc: The scope is not specified. + - + name: netdev + doc: The main shaper for the given network device. + - + name: queue + doc: | + The shaper is attached to the given device queue, + the @id represents the queue number. + - + name: node + doc: | + The shaper allows grouping of queues or other + node shapers; can be nested in either @netdev + shapers or other @node shapers, allowing placement + in any location of the scheduling tree, except + leaves and root. + - + type: enum + name: metric + doc: Different metric supported by the shaper. + entries: + - + name: bps + doc: Shaper operates on a bits per second basis. + - + name: pps + doc: Shaper operates on a packets per second basis. + +attribute-sets: + - + name: net-shaper + attributes: + - + name: handle + type: nest + nested-attributes: handle + doc: Unique identifier for the given shaper inside the owning device. + - + name: metric + type: u32 + enum: metric + doc: Metric used by the given shaper for bw-min, bw-max and burst. + - + name: bw-min + type: uint + doc: Guaranteed bandwidth for the given shaper. + - + name: bw-max + type: uint + doc: Maximum bandwidth for the given shaper or 0 when unlimited. + - + name: burst + type: uint + doc: | + Maximum burst-size for shaping. Should not be interpreted + as a quantum. + - + name: priority + type: u32 + doc: | + Scheduling priority for the given shaper. The priority + scheduling is applied to sibling shapers. + - + name: weight + type: u32 + doc: | + Relative weight for round robin scheduling of the + given shaper. + The scheduling is applied to all sibling shapers + with the same priority. + - + name: ifindex + type: u32 + doc: Interface index owning the specified shaper. + - + name: parent + type: nest + nested-attributes: handle + doc: | + Identifier for the parent of the affected shaper. + Only needed for @group operation. + - + name: leaves + type: nest + multi-attr: true + nested-attributes: leaf-info + doc: | + Describes a set of leaves shapers for a @group operation. + - + name: handle + attributes: + - + name: scope + type: u32 + enum: scope + doc: Defines the shaper @id interpretation. + - + name: id + type: u32 + doc: | + Numeric identifier of a shaper. The id semantic depends on + the scope. For @queue scope it's the queue id and for @node + scope it's the node identifier. + - + name: leaf-info + subset-of: net-shaper + attributes: + - + name: handle + - + name: priority + - + name: weight + +operations: + list: + - + name: get + doc: | + Get information about a shaper for a given device. + attribute-set: net-shaper + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: &ns-binding + - ifindex + - handle + reply: + attributes: &ns-attrs + - ifindex + - parent + - handle + - metric + - bw-min + - bw-max + - burst + - priority + - weight + + dump: + pre: net-shaper-nl-pre-dumpit + post: net-shaper-nl-post-dumpit + request: + attributes: + - ifindex + reply: + attributes: *ns-attrs + - + name: set + doc: | + Create or update the specified shaper. + The set operation can't be used to create a @node scope shaper, + use the @group operation instead. + attribute-set: net-shaper + flags: [ admin-perm ] + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: + - ifindex + - handle + - metric + - bw-min + - bw-max + - burst + - priority + - weight + + - + name: delete + doc: | + Clear (remove) the specified shaper. When deleting + a @node shaper, reattach all the node's leaves to the + deleted node's parent. + If, after the removal, the parent shaper has no more + leaves and the parent shaper scope is @node, the parent + node is deleted, recursively. + When deleting a @queue shaper or a @netdev shaper, + the shaper disappears from the hierarchy, but the + queue/device can still send traffic: it has an implicit + node with infinite bandwidth. The queue's implicit node + feeds an implicit RR node at the root of the hierarchy. + attribute-set: net-shaper + flags: [ admin-perm ] + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: *ns-binding + + - + name: group + doc: | + Create or update a scheduling group, attaching the specified + @leaves shapers under the specified node identified by @handle. + The @leaves shapers scope must be @queue and the node shaper + scope must be either @node or @netdev. + When the node shaper has @node scope, if the @handle @id is not + specified, a new shaper of such scope is created, otherwise the + specified node must already exist. + When updating an existing node shaper, the specified @leaves are + added to the existing node; such node will also retain any preexisting + leave. + The @parent handle for a new node shaper defaults to the parent + of all the leaves, provided all the leaves share the same parent. + Otherwise @parent handle must be specified. + The user can optionally provide shaping attributes for the node + shaper. + The operation is atomic, on failure no change is applied to + the device shaping configuration, otherwise the @node shaper + full identifier, comprising @binding and @handle, is provided + as the reply. + attribute-set: net-shaper + flags: [ admin-perm ] + + do: + pre: net-shaper-nl-pre-doit + post: net-shaper-nl-post-doit + request: + attributes: + - ifindex + - parent + - handle + - metric + - bw-min + - bw-max + - burst + - priority + - weight + - leaves + reply: + attributes: *ns-binding diff --git a/MAINTAINERS b/MAINTAINERS index 1389704c7d8d..2927b44dda25 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16116,6 +16116,7 @@ F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h F: include/uapi/linux/ethtool_netlink.h F: include/uapi/linux/if_* +F: include/uapi/linux/net_shaper.h F: include/uapi/linux/netdev* F: tools/testing/selftests/drivers/net/ X: Documentation/devicetree/bindings/net/bluetooth/ diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h new file mode 100644 index 000000000000..9e3fa63618ee --- /dev/null +++ b/include/uapi/linux/net_shaper.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/net_shaper.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_NET_SHAPER_H +#define _UAPI_LINUX_NET_SHAPER_H + +#define NET_SHAPER_FAMILY_NAME "net-shaper" +#define NET_SHAPER_FAMILY_VERSION 1 + +/** + * enum net_shaper_scope - Defines the shaper @id interpretation. + * @NET_SHAPER_SCOPE_UNSPEC: The scope is not specified. + * @NET_SHAPER_SCOPE_NETDEV: The main shaper for the given network device. + * @NET_SHAPER_SCOPE_QUEUE: The shaper is attached to the given device queue, + * the @id represents the queue number. + * @NET_SHAPER_SCOPE_NODE: The shaper allows grouping of queues or other node + * shapers; can be nested in either @netdev shapers or other @node shapers, + * allowing placement in any location of the scheduling tree, except leaves + * and root. + */ +enum net_shaper_scope { + NET_SHAPER_SCOPE_UNSPEC, + NET_SHAPER_SCOPE_NETDEV, + NET_SHAPER_SCOPE_QUEUE, + NET_SHAPER_SCOPE_NODE, + + /* private: */ + __NET_SHAPER_SCOPE_MAX, + NET_SHAPER_SCOPE_MAX = (__NET_SHAPER_SCOPE_MAX - 1) +}; + +/** + * enum net_shaper_metric - Different metric supported by the shaper. + * @NET_SHAPER_METRIC_BPS: Shaper operates on a bits per second basis. + * @NET_SHAPER_METRIC_PPS: Shaper operates on a packets per second basis. + */ +enum net_shaper_metric { + NET_SHAPER_METRIC_BPS, + NET_SHAPER_METRIC_PPS, +}; + +enum { + NET_SHAPER_A_HANDLE = 1, + NET_SHAPER_A_METRIC, + NET_SHAPER_A_BW_MIN, + NET_SHAPER_A_BW_MAX, + NET_SHAPER_A_BURST, + NET_SHAPER_A_PRIORITY, + NET_SHAPER_A_WEIGHT, + NET_SHAPER_A_IFINDEX, + NET_SHAPER_A_PARENT, + NET_SHAPER_A_LEAVES, + + __NET_SHAPER_A_MAX, + NET_SHAPER_A_MAX = (__NET_SHAPER_A_MAX - 1) +}; + +enum { + NET_SHAPER_A_HANDLE_SCOPE = 1, + NET_SHAPER_A_HANDLE_ID, + + __NET_SHAPER_A_HANDLE_MAX, + NET_SHAPER_A_HANDLE_MAX = (__NET_SHAPER_A_HANDLE_MAX - 1) +}; + +enum { + NET_SHAPER_CMD_GET = 1, + NET_SHAPER_CMD_SET, + NET_SHAPER_CMD_DELETE, + NET_SHAPER_CMD_GROUP, + + __NET_SHAPER_CMD_MAX, + NET_SHAPER_CMD_MAX = (__NET_SHAPER_CMD_MAX - 1) +}; + +#endif /* _UAPI_LINUX_NET_SHAPER_H */ diff --git a/net/Kconfig b/net/Kconfig index a629f92dc86b..c3fca69a7c83 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -72,6 +72,9 @@ config NET_DEVMEM depends on GENERIC_ALLOCATOR depends on PAGE_POOL +config NET_SHAPER + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/Makefile b/net/Makefile index 65bb8c72a35e..60ed5190eda8 100644 --- a/net/Makefile +++ b/net/Makefile @@ -79,3 +79,4 @@ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ obj-$(CONFIG_MCTP) += mctp/ obj-$(CONFIG_NET_HANDSHAKE) += handshake/ +obj-$(CONFIG_NET_SHAPER) += shaper/ diff --git a/net/shaper/Makefile b/net/shaper/Makefile new file mode 100644 index 000000000000..54af7169a331 --- /dev/null +++ b/net/shaper/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for the net shaper infrastructure. +# +# Copyright (c) 2024, Red Hat, Inc. +# + +obj-y += shaper.o shaper_nl_gen.o diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c new file mode 100644 index 000000000000..a1b20888f502 --- /dev/null +++ b/net/shaper/shaper.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include + +#include "shaper_nl_gen.h" + +int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +void net_shaper_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ +} + +int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +static int __init shaper_init(void) +{ + return genl_register_family(&net_shaper_nl_family); +} + +subsys_initcall(shaper_init); diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c new file mode 100644 index 000000000000..34185c5989e6 --- /dev/null +++ b/net/shaper/shaper_nl_gen.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/net_shaper.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "shaper_nl_gen.h" + +#include + +/* Common nested types */ +const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1] = { + [NET_SHAPER_A_HANDLE_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), + [NET_SHAPER_A_HANDLE_ID] = { .type = NLA_U32, }, +}; + +const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = { + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, }, + [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, }, +}; + +/* NET_SHAPER_CMD_GET - do */ +static const struct nla_policy net_shaper_get_do_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), +}; + +/* NET_SHAPER_CMD_GET - dump */ +static const struct nla_policy net_shaper_get_dump_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, +}; + +/* NET_SHAPER_CMD_SET - do */ +static const struct nla_policy net_shaper_set_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_METRIC] = NLA_POLICY_MAX(NLA_U32, 1), + [NET_SHAPER_A_BW_MIN] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BURST] = { .type = NLA_UINT, }, + [NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, }, + [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, }, +}; + +/* NET_SHAPER_CMD_DELETE - do */ +static const struct nla_policy net_shaper_delete_nl_policy[NET_SHAPER_A_IFINDEX + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), +}; + +/* NET_SHAPER_CMD_GROUP - do */ +static const struct nla_policy net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES + 1] = { + [NET_SHAPER_A_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_PARENT] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy), + [NET_SHAPER_A_METRIC] = NLA_POLICY_MAX(NLA_U32, 1), + [NET_SHAPER_A_BW_MIN] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, }, + [NET_SHAPER_A_BURST] = { .type = NLA_UINT, }, + [NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, }, + [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, }, + [NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy), +}; + +/* Ops table for net_shaper */ +static const struct genl_split_ops net_shaper_nl_ops[] = { + { + .cmd = NET_SHAPER_CMD_GET, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_get_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_get_do_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_GET, + .start = net_shaper_nl_pre_dumpit, + .dumpit = net_shaper_nl_get_dumpit, + .done = net_shaper_nl_post_dumpit, + .policy = net_shaper_get_dump_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = NET_SHAPER_CMD_SET, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_set_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_set_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_DELETE, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_delete_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_delete_nl_policy, + .maxattr = NET_SHAPER_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_GROUP, + .pre_doit = net_shaper_nl_pre_doit, + .doit = net_shaper_nl_group_doit, + .post_doit = net_shaper_nl_post_doit, + .policy = net_shaper_group_nl_policy, + .maxattr = NET_SHAPER_A_LEAVES, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, +}; + +struct genl_family net_shaper_nl_family __ro_after_init = { + .name = NET_SHAPER_FAMILY_NAME, + .version = NET_SHAPER_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = net_shaper_nl_ops, + .n_split_ops = ARRAY_SIZE(net_shaper_nl_ops), +}; diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h new file mode 100644 index 000000000000..016cb6f3187b --- /dev/null +++ b/net/shaper/shaper_nl_gen.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/net_shaper.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_NET_SHAPER_GEN_H +#define _LINUX_NET_SHAPER_GEN_H + +#include +#include + +#include + +/* Common nested types */ +extern const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]; +extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1]; + +int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb); +int net_shaper_nl_post_dumpit(struct netlink_callback *cb); + +int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info); + +extern struct genl_family net_shaper_nl_family; + +#endif /* _LINUX_NET_SHAPER_GEN_H */ -- cgit v1.3 From 4b623f9f0f59652ea71fcb27d60b4c3b65126dbb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:49 +0200 Subject: net-shapers: implement NL get operation Introduce the basic infrastructure to implement the net-shaper core functionality. Each network devices carries a net-shaper cache, the NL get() operation fetches the data from such cache. The cache is initially empty, will be fill by the set()/group() operation implemented later and is destroyed at device cleanup time. The net_shaper_fill_handle(), net_shaper_ctx_init(), and net_shaper_generic_pre() implementations handle generic index type attributes, despite the current caller always pass a constant value to avoid more noise in later patches using them with different attributes. Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ddd10fd645a9367803ad02fca4a5664ea5ace170.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/kapi.rst | 3 + include/linux/netdevice.h | 21 +++ include/net/net_shaper.h | 120 ++++++++++++++ net/core/dev.c | 6 + net/core/dev.h | 6 + net/shaper/shaper.c | 335 +++++++++++++++++++++++++++++++++++++- 6 files changed, 484 insertions(+), 7 deletions(-) create mode 100644 include/net/net_shaper.h (limited to 'net') diff --git a/Documentation/networking/kapi.rst b/Documentation/networking/kapi.rst index ea55f462cefa..98682b9a13ee 100644 --- a/Documentation/networking/kapi.rst +++ b/Documentation/networking/kapi.rst @@ -104,6 +104,9 @@ Driver Support .. kernel-doc:: include/linux/netdevice.h :internal: +.. kernel-doc:: include/net/net_shaper.h + :internal: + PHY Support ----------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3baf8e539b6f..e6b93d01e631 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1603,6 +1603,14 @@ struct net_device_ops { int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_ops: Device shaping offload operations + * see include/net/net_shapers.h + */ + const struct net_shaper_ops *net_shaper_ops; +#endif }; /** @@ -2406,6 +2414,19 @@ struct net_device { u64 max_pacing_offload_horizon; + /** + * @lock: protects @net_shaper_hierarchy, feel free to use for other + * netdev-scope protection. Ordering: take after rtnl_lock. + */ + struct mutex lock; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + /** + * @net_shaper_hierarchy: data tracking the current shaper status + * see include/net/net_shapers.h + */ + struct net_shaper_hierarchy *net_shaper_hierarchy; +#endif u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h new file mode 100644 index 000000000000..5c3f49b52fe9 --- /dev/null +++ b/include/net/net_shaper.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_SHAPER_H_ +#define _NET_SHAPER_H_ + +#include + +#include + +struct net_device; +struct devlink; +struct netlink_ext_ack; + +enum net_shaper_binding_type { + NET_SHAPER_BINDING_TYPE_NETDEV, + /* NET_SHAPER_BINDING_TYPE_DEVLINK_PORT */ +}; + +struct net_shaper_binding { + enum net_shaper_binding_type type; + union { + struct net_device *netdev; + struct devlink *devlink; + }; +}; + +struct net_shaper_handle { + enum net_shaper_scope scope; + u32 id; +}; + +/** + * struct net_shaper - represents a shaping node on the NIC H/W + * zeroed field are considered not set. + * @parent: Unique identifier for the shaper parent, usually implied + * @handle: Unique identifier for this shaper + * @metric: Specify if the rate limits refers to PPS or BPS + * @bw_min: Minimum guaranteed rate for this shaper + * @bw_max: Maximum peak rate allowed for this shaper + * @burst: Maximum burst for the peek rate of this shaper + * @priority: Scheduling priority for this shaper + * @weight: Scheduling weight for this shaper + */ +struct net_shaper { + struct net_shaper_handle parent; + struct net_shaper_handle handle; + enum net_shaper_metric metric; + u64 bw_min; + u64 bw_max; + u64 burst; + u32 priority; + u32 weight; + + /* private: */ + u32 leaves; /* accounted only for NODE scope */ + struct rcu_head rcu; +}; + +/** + * struct net_shaper_ops - Operations on device H/W shapers + * + * The operations applies to either net_device and devlink objects. + * The initial shaping configuration at device initialization is empty: + * does not constraint the rate in any way. + * The network core keeps track of the applied user-configuration in + * the net_device or devlink structure. + * The operations are serialized via a per device lock. + * + * Device not supporting any kind of nesting should not provide the + * group operation. + * + * Each shaper is uniquely identified within the device with a 'handle' + * comprising the shaper scope and a scope-specific id. + */ +struct net_shaper_ops { + /** + * @group: create the specified shapers scheduling group + * + * Nest the @leaves shapers identified under the * @node shaper. + * All the shapers belong to the device specified by @binding. + * The @leaves arrays size is specified by @leaves_count. + * Create either the @leaves and the @node shaper; or if they already + * exists, links them together in the desired way. + * @leaves scope must be NET_SHAPER_SCOPE_QUEUE. + */ + int (*group)(struct net_shaper_binding *binding, int leaves_count, + const struct net_shaper *leaves, + const struct net_shaper *node, + struct netlink_ext_ack *extack); + + /** + * @set: Updates the specified shaper + * + * Updates or creates the @shaper on the device specified by @binding. + */ + int (*set)(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack); + + /** + * @delete: Removes the specified shaper + * + * Removes the shaper configuration as identified by the given @handle + * on the device specified by @binding, restoring the default behavior. + */ + int (*delete)(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack); + + /** + * @capabilities: get the shaper features supported by the device + * + * Fills the bitmask @cap with the supported capabilities for the + * specified @scope and device specified by @binding. + */ + void (*capabilities)(struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long *cap); +}; + +#endif diff --git a/net/core/dev.c b/net/core/dev.c index ea5fbcd133ae..6e727c49a6f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11147,6 +11147,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, hash_init(dev->qdisc_hash); #endif + mutex_init(&dev->lock); + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11217,6 +11219,8 @@ void free_netdev(struct net_device *dev) return; } + mutex_destroy(&dev->lock); + kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11426,6 +11430,8 @@ void unregister_netdevice_many_notify(struct list_head *head, mutex_destroy(&dev->ethtool->rss_lock); + net_shaper_flush_netdev(dev); + if (skb) rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); diff --git a/net/core/dev.h b/net/core/dev.h index 5654325c5b71..13c558874af3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -35,6 +35,12 @@ void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); +#if IS_ENABLED(CONFIG_NET_SHAPER) +void net_shaper_flush_netdev(struct net_device *dev); +#else +static inline void net_shaper_flush_netdev(struct net_device *dev) {} +#endif + /* sysctls not referred to from outside net/core/ */ extern int netdev_unregister_timeout_secs; extern int weight_p; diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index a1b20888f502..22daf7dde999 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -1,30 +1,333 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include +#include +#include #include +#include +#include #include +#include +#include +#include #include "shaper_nl_gen.h" +#include "../core/dev.h" + +#define NET_SHAPER_SCOPE_SHIFT 26 +#define NET_SHAPER_ID_MASK GENMASK(NET_SHAPER_SCOPE_SHIFT - 1, 0) +#define NET_SHAPER_SCOPE_MASK GENMASK(31, NET_SHAPER_SCOPE_SHIFT) + +#define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK + +struct net_shaper_hierarchy { + struct xarray shapers; +}; + +struct net_shaper_nl_ctx { + struct net_shaper_binding binding; + netdevice_tracker dev_tracker; + unsigned long start_index; +}; + +static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) +{ + return &((struct net_shaper_nl_ctx *)ctx)->binding; +} + +static struct net_shaper_hierarchy * +net_shaper_hierarchy(struct net_shaper_binding *binding) +{ + /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ + if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) + return READ_ONCE(binding->netdev->net_shaper_hierarchy); + + /* No other type supported yet. */ + return NULL; +} + +static int net_shaper_fill_binding(struct sk_buff *msg, + const struct net_shaper_binding *binding, + u32 type) +{ + /* Should never happen, as currently only NETDEV is supported. */ + if (WARN_ON_ONCE(binding->type != NET_SHAPER_BINDING_TYPE_NETDEV)) + return -EINVAL; + + if (nla_put_u32(msg, type, binding->netdev->ifindex)) + return -EMSGSIZE; + + return 0; +} + +static int net_shaper_fill_handle(struct sk_buff *msg, + const struct net_shaper_handle *handle, + u32 type) +{ + struct nlattr *handle_attr; + + if (handle->scope == NET_SHAPER_SCOPE_UNSPEC) + return 0; + + handle_attr = nla_nest_start(msg, type); + if (!handle_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, NET_SHAPER_A_HANDLE_SCOPE, handle->scope) || + (handle->scope >= NET_SHAPER_SCOPE_QUEUE && + nla_put_u32(msg, NET_SHAPER_A_HANDLE_ID, handle->id))) + goto handle_nest_cancel; + + nla_nest_end(msg, handle_attr); + return 0; + +handle_nest_cancel: + nla_nest_cancel(msg, handle_attr); + return -EMSGSIZE; +} + +static int +net_shaper_fill_one(struct sk_buff *msg, + const struct net_shaper_binding *binding, + const struct net_shaper *shaper, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(msg, info); + if (!hdr) + return -EMSGSIZE; + + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || + net_shaper_fill_handle(msg, &shaper->parent, + NET_SHAPER_A_PARENT) || + net_shaper_fill_handle(msg, &shaper->handle, + NET_SHAPER_A_HANDLE) || + ((shaper->bw_min || shaper->bw_max || shaper->burst) && + nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || + (shaper->bw_min && + nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || + (shaper->bw_max && + nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || + (shaper->burst && + nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || + (shaper->priority && + nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || + (shaper->weight && + nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/* Initialize the context fetching the relevant device and + * acquiring a reference to it. + */ +static int net_shaper_ctx_setup(const struct genl_info *info, int type, + struct net_shaper_nl_ctx *ctx) +{ + struct net *ns = genl_info_net(info); + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, type)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[type]); + dev = netdev_get_by_index(ns, ifindex, &ctx->dev_tracker, GFP_KERNEL); + if (!dev) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + return -ENOENT; + } + + if (!dev->netdev_ops->net_shaper_ops) { + NL_SET_BAD_ATTR(info->extack, info->attrs[type]); + netdev_put(dev, &ctx->dev_tracker); + return -EOPNOTSUPP; + } + + ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; + ctx->binding.netdev = dev; + return 0; +} + +static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx) +{ + if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) + netdev_put(ctx->binding.netdev, &ctx->dev_tracker); +} + +static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) +{ + return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) | + FIELD_PREP(NET_SHAPER_ID_MASK, handle->id); +} + +static struct net_shaper * +net_shaper_lookup(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + u32 index = net_shaper_handle_to_index(handle); + + return hierarchy ? xa_load(&hierarchy->shapers, index) : NULL; +} + +static int net_shaper_parse_handle(const struct nlattr *attr, + const struct genl_info *info, + struct net_shaper_handle *handle) +{ + struct nlattr *tb[NET_SHAPER_A_HANDLE_MAX + 1]; + struct nlattr *id_attr; + u32 id = 0; + int ret; + + ret = nla_parse_nested(tb, NET_SHAPER_A_HANDLE_MAX, attr, + net_shaper_handle_nl_policy, info->extack); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, + NET_SHAPER_A_HANDLE_SCOPE)) + return -EINVAL; + + handle->scope = nla_get_u32(tb[NET_SHAPER_A_HANDLE_SCOPE]); + + /* The default id for NODE scope shapers is an invalid one + * to help the 'group' operation discriminate between new + * NODE shaper creation (ID_UNSPEC) and reuse of existing + * shaper (any other value). + */ + id_attr = tb[NET_SHAPER_A_HANDLE_ID]; + if (id_attr) + id = nla_get_u32(id_attr); + else if (handle->scope == NET_SHAPER_SCOPE_NODE) + id = NET_SHAPER_ID_UNSPEC; + + handle->id = id; + return 0; +} + +static int net_shaper_generic_pre(struct genl_info *info, int type) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx)); + + return net_shaper_ctx_setup(info, type, ctx); +} + int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + return net_shaper_generic_pre(info, NET_SHAPER_A_IFINDEX); +} + +static void net_shaper_generic_post(struct genl_info *info) +{ + net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)info->ctx); } void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + net_shaper_generic_post(info); +} + +int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +{ + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + + return net_shaper_ctx_setup(info, NET_SHAPER_A_IFINDEX, ctx); +} + +int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +{ + net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)cb->ctx); + return 0; } int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct net_shaper_binding *binding; + struct net_shaper_handle handle; + struct net_shaper *shaper; + struct sk_buff *msg; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) + return -EINVAL; + + binding = net_shaper_binding_from_ctx(info->ctx); + ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, + &handle); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rcu_read_lock(); + shaper = net_shaper_lookup(binding, &handle); + if (!shaper) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NET_SHAPER_A_HANDLE]); + rcu_read_unlock(); + ret = -ENOENT; + goto free_msg; + } + + ret = net_shaper_fill_one(msg, binding, shaper, info); + rcu_read_unlock(); + if (ret) + goto free_msg; + + ret = genlmsg_reply(msg, info); + if (ret) + goto free_msg; + + return 0; + +free_msg: + nlmsg_free(msg); + return ret; } int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + const struct genl_info *info = genl_info_dump(cb); + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding *binding; + struct net_shaper *shaper; + int ret = 0; + + /* Don't error out dumps performed before any set operation. */ + binding = net_shaper_binding_from_ctx(ctx); + hierarchy = net_shaper_hierarchy(binding); + if (!hierarchy) + return 0; + + rcu_read_lock(); + for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, + U32_MAX, XA_PRESENT)); ctx->start_index++) { + ret = net_shaper_fill_one(skb, binding, shaper, info); + if (ret) + break; + } + rcu_read_unlock(); + + return ret; } int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) @@ -37,14 +340,32 @@ int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } -int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) +static void net_shaper_flush(struct net_shaper_binding *binding) { - return -EOPNOTSUPP; + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *cur; + unsigned long index; + + if (!hierarchy) + return; + + xa_lock(&hierarchy->shapers); + xa_for_each(&hierarchy->shapers, index, cur) { + __xa_erase(&hierarchy->shapers, index); + kfree(cur); + } + xa_unlock(&hierarchy->shapers); + kfree(hierarchy); } -int net_shaper_nl_post_dumpit(struct netlink_callback *cb) +void net_shaper_flush_netdev(struct net_device *dev) { - return -EOPNOTSUPP; + struct net_shaper_binding binding = { + .type = NET_SHAPER_BINDING_TYPE_NETDEV, + .netdev = dev, + }; + + net_shaper_flush(&binding); } static int __init shaper_init(void) -- cgit v1.3 From 93954b40f6a4fc43226c01a15b02732f884500f1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:50 +0200 Subject: net-shapers: implement NL set and delete operations Both NL operations directly map on the homonymous device shaper callbacks, update accordingly the shapers cache and are serialized via a per device lock. Implement the cache modification helpers to additionally deal with NODE scope shaper. That will be needed by the group() operation implemented in the next patch. The delete implementation is partial: does not handle NODE scope shaper yet. Such support will require infrastructure from the next patch and will be implemented later in the series. Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/1e6a34a4095b35d773d2b9c476164671bbcf8397.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 383 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 380 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 22daf7dde999..5946f140f3d0 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -36,6 +36,24 @@ static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) return &((struct net_shaper_nl_ctx *)ctx)->binding; } +static void net_shaper_lock(struct net_shaper_binding *binding) +{ + switch (binding->type) { + case NET_SHAPER_BINDING_TYPE_NETDEV: + mutex_lock(&binding->netdev->lock); + break; + } +} + +static void net_shaper_unlock(struct net_shaper_binding *binding) +{ + switch (binding->type) { + case NET_SHAPER_BINDING_TYPE_NETDEV: + mutex_unlock(&binding->netdev->lock); + break; + } +} + static struct net_shaper_hierarchy * net_shaper_hierarchy(struct net_shaper_binding *binding) { @@ -47,6 +65,16 @@ net_shaper_hierarchy(struct net_shaper_binding *binding) return NULL; } +static const struct net_shaper_ops * +net_shaper_ops(struct net_shaper_binding *binding) +{ + if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) + return binding->netdev->netdev_ops->net_shaper_ops; + + /* No other type supported yet. */ + return NULL; +} + static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) @@ -170,6 +198,37 @@ static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) FIELD_PREP(NET_SHAPER_ID_MASK, handle->id); } +static void net_shaper_index_to_handle(u32 index, + struct net_shaper_handle *handle) +{ + handle->scope = FIELD_GET(NET_SHAPER_SCOPE_MASK, index); + handle->id = FIELD_GET(NET_SHAPER_ID_MASK, index); +} + +static void net_shaper_default_parent(const struct net_shaper_handle *handle, + struct net_shaper_handle *parent) +{ + switch (handle->scope) { + case NET_SHAPER_SCOPE_UNSPEC: + case NET_SHAPER_SCOPE_NETDEV: + case __NET_SHAPER_SCOPE_MAX: + parent->scope = NET_SHAPER_SCOPE_UNSPEC; + break; + + case NET_SHAPER_SCOPE_QUEUE: + case NET_SHAPER_SCOPE_NODE: + parent->scope = NET_SHAPER_SCOPE_NETDEV; + break; + } + parent->id = 0; +} + +/* + * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as + * it's cleared by xa_store(). + */ +#define NET_SHAPER_NOT_VALID XA_MARK_1 + static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) @@ -177,7 +236,154 @@ net_shaper_lookup(struct net_shaper_binding *binding, struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); u32 index = net_shaper_handle_to_index(handle); - return hierarchy ? xa_load(&hierarchy->shapers, index) : NULL; + if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, + NET_SHAPER_NOT_VALID)) + return NULL; + + return xa_load(&hierarchy->shapers, index); +} + +/* Allocate on demand the per device shaper's hierarchy container. + * Called under the net shaper lock + */ +static struct net_shaper_hierarchy * +net_shaper_hierarchy_setup(struct net_shaper_binding *binding) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + + if (hierarchy) + return hierarchy; + + hierarchy = kmalloc(sizeof(*hierarchy), GFP_KERNEL); + if (!hierarchy) + return NULL; + + /* The flag is required for ID allocation */ + xa_init_flags(&hierarchy->shapers, XA_FLAGS_ALLOC); + + switch (binding->type) { + case NET_SHAPER_BINDING_TYPE_NETDEV: + /* Pairs with READ_ONCE in net_shaper_hierarchy. */ + WRITE_ONCE(binding->netdev->net_shaper_hierarchy, hierarchy); + break; + } + return hierarchy; +} + +/* Prepare the hierarchy container to actually insert the given shaper, doing + * in advance the needed allocations. + */ +static int net_shaper_pre_insert(struct net_shaper_binding *binding, + struct net_shaper_handle *handle, + struct netlink_ext_ack *extack) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *prev, *cur; + bool id_allocated = false; + int ret, index; + + if (!hierarchy) + return -ENOMEM; + + index = net_shaper_handle_to_index(handle); + cur = xa_load(&hierarchy->shapers, index); + if (cur) + return 0; + + /* Allocated a new id, if needed. */ + if (handle->scope == NET_SHAPER_SCOPE_NODE && + handle->id == NET_SHAPER_ID_UNSPEC) { + u32 min, max; + + handle->id = NET_SHAPER_ID_MASK - 1; + max = net_shaper_handle_to_index(handle); + handle->id = 0; + min = net_shaper_handle_to_index(handle); + + ret = xa_alloc(&hierarchy->shapers, &index, NULL, + XA_LIMIT(min, max), GFP_KERNEL); + if (ret < 0) { + NL_SET_ERR_MSG(extack, "Can't allocate new id for NODE shaper"); + return ret; + } + + net_shaper_index_to_handle(index, handle); + id_allocated = true; + } + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) { + ret = -ENOMEM; + goto free_id; + } + + /* Mark 'tentative' shaper inside the hierarchy container. + * xa_set_mark is a no-op if the previous store fails. + */ + xa_lock(&hierarchy->shapers); + prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); + __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); + xa_unlock(&hierarchy->shapers); + if (xa_err(prev)) { + NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); + kfree_rcu(cur, rcu); + ret = xa_err(prev); + goto free_id; + } + return 0; + +free_id: + if (id_allocated) + xa_erase(&hierarchy->shapers, index); + return ret; +} + +/* Commit the tentative insert with the actual values. + * Must be called only after a successful net_shaper_pre_insert(). + */ +static void net_shaper_commit(struct net_shaper_binding *binding, + int nr_shapers, const struct net_shaper *shapers) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *cur; + int index; + int i; + + xa_lock(&hierarchy->shapers); + for (i = 0; i < nr_shapers; ++i) { + index = net_shaper_handle_to_index(&shapers[i].handle); + + cur = xa_load(&hierarchy->shapers, index); + if (WARN_ON_ONCE(!cur)) + continue; + + /* Successful update: drop the tentative mark + * and update the hierarchy container. + */ + __xa_clear_mark(&hierarchy->shapers, index, + NET_SHAPER_NOT_VALID); + *cur = shapers[i]; + } + xa_unlock(&hierarchy->shapers); +} + +/* Rollback all the tentative inserts from the hierarchy. */ +static void net_shaper_rollback(struct net_shaper_binding *binding) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *cur; + unsigned long index; + + if (!hierarchy) + return; + + xa_lock(&hierarchy->shapers); + xa_for_each_marked(&hierarchy->shapers, index, cur, + NET_SHAPER_NOT_VALID) { + __xa_erase(&hierarchy->shapers, index); + kfree(cur); + } + xa_unlock(&hierarchy->shapers); } static int net_shaper_parse_handle(const struct nlattr *attr, @@ -215,6 +421,57 @@ static int net_shaper_parse_handle(const struct nlattr *attr, return 0; } +static int net_shaper_parse_info(struct net_shaper_binding *binding, + struct nlattr **tb, + const struct genl_info *info, + struct net_shaper *shaper, + bool *exists) +{ + struct net_shaper *old; + int ret; + + /* The shaper handle is the only mandatory attribute. */ + if (NL_REQ_ATTR_CHECK(info->extack, NULL, tb, NET_SHAPER_A_HANDLE)) + return -EINVAL; + + ret = net_shaper_parse_handle(tb[NET_SHAPER_A_HANDLE], info, + &shaper->handle); + if (ret) + return ret; + + if (shaper->handle.scope == NET_SHAPER_SCOPE_UNSPEC) { + NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); + return -EINVAL; + } + + /* Fetch existing hierarchy, if any, so that user provide info will + * incrementally update the existing shaper configuration. + */ + old = net_shaper_lookup(binding, &shaper->handle); + if (old) + *shaper = *old; + *exists = !!old; + + if (tb[NET_SHAPER_A_METRIC]) + shaper->metric = nla_get_u32(tb[NET_SHAPER_A_METRIC]); + + if (tb[NET_SHAPER_A_BW_MIN]) + shaper->bw_min = nla_get_uint(tb[NET_SHAPER_A_BW_MIN]); + + if (tb[NET_SHAPER_A_BW_MAX]) + shaper->bw_max = nla_get_uint(tb[NET_SHAPER_A_BW_MAX]); + + if (tb[NET_SHAPER_A_BURST]) + shaper->burst = nla_get_uint(tb[NET_SHAPER_A_BURST]); + + if (tb[NET_SHAPER_A_PRIORITY]) + shaper->priority = nla_get_u32(tb[NET_SHAPER_A_PRIORITY]); + + if (tb[NET_SHAPER_A_WEIGHT]) + shaper->weight = nla_get_u32(tb[NET_SHAPER_A_WEIGHT]); + return 0; +} + static int net_shaper_generic_pre(struct genl_info *info, int type) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; @@ -332,12 +589,129 @@ int net_shaper_nl_get_dumpit(struct sk_buff *skb, int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding *binding; + const struct net_shaper_ops *ops; + struct net_shaper_handle handle; + struct net_shaper shaper = {}; + bool exists; + int ret; + + binding = net_shaper_binding_from_ctx(info->ctx); + + net_shaper_lock(binding); + ret = net_shaper_parse_info(binding, info->attrs, info, &shaper, + &exists); + if (ret) + goto unlock; + + if (!exists) + net_shaper_default_parent(&shaper.handle, &shaper.parent); + + hierarchy = net_shaper_hierarchy_setup(binding); + if (!hierarchy) { + ret = -ENOMEM; + goto unlock; + } + + /* The 'set' operation can't create node-scope shapers. */ + handle = shaper.handle; + if (handle.scope == NET_SHAPER_SCOPE_NODE && + !net_shaper_lookup(binding, &handle)) { + ret = -ENOENT; + goto unlock; + } + + ret = net_shaper_pre_insert(binding, &handle, info->extack); + if (ret) + goto unlock; + + ops = net_shaper_ops(binding); + ret = ops->set(binding, &shaper, info->extack); + if (ret) { + net_shaper_rollback(binding); + goto unlock; + } + + net_shaper_commit(binding, 1, &shaper); + +unlock: + net_shaper_unlock(binding); + return ret; +} + +static int __net_shaper_delete(struct net_shaper_binding *binding, + struct net_shaper *shaper, + struct netlink_ext_ack *extack) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper_handle parent_handle, handle = shaper->handle; + const struct net_shaper_ops *ops = net_shaper_ops(binding); + int ret; + +again: + parent_handle = shaper->parent; + + ret = ops->delete(binding, &handle, extack); + if (ret < 0) + return ret; + + xa_erase(&hierarchy->shapers, net_shaper_handle_to_index(&handle)); + kfree_rcu(shaper, rcu); + + /* Eventually delete the parent, if it is left over with no leaves. */ + if (parent_handle.scope == NET_SHAPER_SCOPE_NODE) { + shaper = net_shaper_lookup(binding, &parent_handle); + if (shaper && !--shaper->leaves) { + handle = parent_handle; + goto again; + } + } + return 0; } int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding *binding; + struct net_shaper_handle handle; + struct net_shaper *shaper; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) + return -EINVAL; + + binding = net_shaper_binding_from_ctx(info->ctx); + + net_shaper_lock(binding); + ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, + &handle); + if (ret) + goto unlock; + + hierarchy = net_shaper_hierarchy(binding); + if (!hierarchy) { + ret = -ENOENT; + goto unlock; + } + + shaper = net_shaper_lookup(binding, &handle); + if (!shaper) { + ret = -ENOENT; + goto unlock; + } + + if (handle.scope == NET_SHAPER_SCOPE_NODE) { + /* TODO: implement support for scope NODE delete. */ + ret = -EINVAL; + goto unlock; + } + + ret = __net_shaper_delete(binding, shaper, info->extack); + +unlock: + net_shaper_unlock(binding); + return ret; } static void net_shaper_flush(struct net_shaper_binding *binding) @@ -349,12 +723,15 @@ static void net_shaper_flush(struct net_shaper_binding *binding) if (!hierarchy) return; + net_shaper_lock(binding); xa_lock(&hierarchy->shapers); xa_for_each(&hierarchy->shapers, index, cur) { __xa_erase(&hierarchy->shapers, index); kfree(cur); } xa_unlock(&hierarchy->shapers); + net_shaper_unlock(binding); + kfree(hierarchy); } -- cgit v1.3 From 5d5d4700e75d861e83bf18eb6bf66ff90f85fe4e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:51 +0200 Subject: net-shapers: implement NL group operation Allow grouping multiple leaves shaper under the given root. The node and the leaves shapers are created, if needed, otherwise the existing shapers are re-linked as requested. Try hard to pre-allocated the needed resources, to avoid non trivial H/W configuration rollbacks in case of any failure. Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/8a721274fde18b872d1e3a61aaa916bb7b7996d3.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 350 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 350 insertions(+) (limited to 'net') diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 5946f140f3d0..c23ac611850d 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -75,6 +75,24 @@ net_shaper_ops(struct net_shaper_binding *binding) return NULL; } +/* Count the number of [multi] attributes of the given type. */ +static int net_shaper_list_len(struct genl_info *info, int type) +{ + struct nlattr *attr; + int rem, cnt = 0; + + nla_for_each_attr_type(attr, type, genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) + cnt++; + return cnt; +} + +static int net_shaper_handle_size(void) +{ + return nla_total_size(nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(u32))); +} + static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) @@ -472,6 +490,74 @@ static int net_shaper_parse_info(struct net_shaper_binding *binding, return 0; } +/* Fetch the existing leaf and update it with the user-provided + * attributes. + */ +static int net_shaper_parse_leaf(struct net_shaper_binding *binding, + const struct nlattr *attr, + const struct genl_info *info, + const struct net_shaper *node, + struct net_shaper *shaper) +{ + struct nlattr *tb[NET_SHAPER_A_WEIGHT + 1]; + bool exists; + int ret; + + ret = nla_parse_nested(tb, NET_SHAPER_A_WEIGHT, attr, + net_shaper_leaf_info_nl_policy, info->extack); + if (ret < 0) + return ret; + + ret = net_shaper_parse_info(binding, tb, info, shaper, &exists); + if (ret < 0) + return ret; + + if (shaper->handle.scope != NET_SHAPER_SCOPE_QUEUE) { + NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); + return -EINVAL; + } + + if (!exists) + net_shaper_default_parent(&shaper->handle, &shaper->parent); + return 0; +} + +/* Alike net_parse_shaper_info(), but additionally allow the user specifying + * the shaper's parent handle. + */ +static int net_shaper_parse_node(struct net_shaper_binding *binding, + struct nlattr **tb, + const struct genl_info *info, + struct net_shaper *shaper) +{ + bool exists; + int ret; + + ret = net_shaper_parse_info(binding, tb, info, shaper, &exists); + if (ret) + return ret; + + if (shaper->handle.scope != NET_SHAPER_SCOPE_NODE && + shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) { + NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); + return -EINVAL; + } + + if (tb[NET_SHAPER_A_PARENT]) { + ret = net_shaper_parse_handle(tb[NET_SHAPER_A_PARENT], info, + &shaper->parent); + if (ret) + return ret; + + if (shaper->parent.scope != NET_SHAPER_SCOPE_NODE && + shaper->parent.scope != NET_SHAPER_SCOPE_NETDEV) { + NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_PARENT]); + return -EINVAL; + } + } + return 0; +} + static int net_shaper_generic_pre(struct genl_info *info, int type) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; @@ -670,6 +756,123 @@ again: return 0; } +static int net_shaper_handle_cmp(const struct net_shaper_handle *a, + const struct net_shaper_handle *b) +{ + /* Must avoid holes in struct net_shaper_handle. */ + BUILD_BUG_ON(sizeof(*a) != 8); + + return memcmp(a, b, sizeof(*a)); +} + +static int net_shaper_parent_from_leaves(int leaves_count, + const struct net_shaper *leaves, + struct net_shaper *node, + struct netlink_ext_ack *extack) +{ + struct net_shaper_handle parent = leaves[0].parent; + int i; + + for (i = 1; i < leaves_count; ++i) { + if (net_shaper_handle_cmp(&leaves[i].parent, &parent)) { + NL_SET_ERR_MSG_FMT(extack, "All the leaves shapers must have the same old parent"); + return -EINVAL; + } + } + + node->parent = parent; + return 0; +} + +static int __net_shaper_group(struct net_shaper_binding *binding, + int leaves_count, struct net_shaper *leaves, + struct net_shaper *node, + struct netlink_ext_ack *extack) +{ + const struct net_shaper_ops *ops = net_shaper_ops(binding); + struct net_shaper_handle leaf_handle; + struct net_shaper *parent = NULL; + bool new_node = false; + int i, ret; + + if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { + new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; + + if (!new_node && !net_shaper_lookup(binding, &node->handle)) { + /* The related attribute is not available when + * reaching here from the delete() op. + */ + NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", + node->handle.scope, node->handle.id); + return -ENOENT; + } + + /* When unspecified, the node parent scope is inherited from + * the leaves. + */ + if (node->parent.scope == NET_SHAPER_SCOPE_UNSPEC) { + ret = net_shaper_parent_from_leaves(leaves_count, + leaves, node, + extack); + if (ret) + return ret; + } + + } else { + net_shaper_default_parent(&node->handle, &node->parent); + } + + if (node->parent.scope == NET_SHAPER_SCOPE_NODE) { + parent = net_shaper_lookup(binding, &node->parent); + if (!parent) { + NL_SET_ERR_MSG_FMT(extack, "Node parent shaper %d:%d does not exists", + node->parent.scope, node->parent.id); + return -ENOENT; + } + } + + /* For newly created node scope shaper, the following will update + * the handle, due to id allocation. + */ + ret = net_shaper_pre_insert(binding, &node->handle, extack); + if (ret) + return ret; + + for (i = 0; i < leaves_count; ++i) { + leaf_handle = leaves[i].handle; + + ret = net_shaper_pre_insert(binding, &leaf_handle, extack); + if (ret) + goto rollback; + + if (!net_shaper_handle_cmp(&leaves[i].parent, &node->handle)) + continue; + + /* The leaves shapers will be nested to the node, update the + * linking accordingly. + */ + leaves[i].parent = node->handle; + node->leaves++; + } + + ret = ops->group(binding, leaves_count, leaves, node, extack); + if (ret < 0) + goto rollback; + + /* The node's parent gains a new leaf only when the node itself + * is created by this group operation + */ + if (new_node && parent) + parent->leaves++; + net_shaper_commit(binding, 1, node); + net_shaper_commit(binding, leaves_count, leaves); + return 0; + +rollback: + net_shaper_rollback(binding); + return ret; +} + int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_hierarchy *hierarchy; @@ -714,6 +917,153 @@ unlock: return ret; } +static int net_shaper_group_send_reply(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct genl_info *info, + struct sk_buff *msg) +{ + void *hdr; + + hdr = genlmsg_iput(msg, info); + if (!hdr) + goto free_msg; + + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || + net_shaper_fill_handle(msg, handle, NET_SHAPER_A_HANDLE)) + goto free_msg; + + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +free_msg: + /* Should never happen as msg is pre-allocated with enough space. */ + WARN_ONCE(true, "calculated message payload length (%d)", + net_shaper_handle_size()); + nlmsg_free(msg); + return -EMSGSIZE; +} + +int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_shaper **old_nodes, *leaves, node = {}; + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding *binding; + int i, ret, rem, leaves_count; + int old_nodes_count = 0; + struct sk_buff *msg; + struct nlattr *attr; + + if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) + return -EINVAL; + + binding = net_shaper_binding_from_ctx(info->ctx); + + /* The group operation is optional. */ + if (!net_shaper_ops(binding)->group) + return -EOPNOTSUPP; + + net_shaper_lock(binding); + leaves_count = net_shaper_list_len(info, NET_SHAPER_A_LEAVES); + if (!leaves_count) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NET_SHAPER_A_LEAVES]); + ret = -EINVAL; + goto unlock; + } + + leaves = kcalloc(leaves_count, sizeof(struct net_shaper) + + sizeof(struct net_shaper *), GFP_KERNEL); + if (!leaves) { + ret = -ENOMEM; + goto unlock; + } + old_nodes = (void *)&leaves[leaves_count]; + + ret = net_shaper_parse_node(binding, info->attrs, info, &node); + if (ret) + goto free_leaves; + + i = 0; + nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + if (WARN_ON_ONCE(i >= leaves_count)) + goto free_leaves; + + ret = net_shaper_parse_leaf(binding, attr, info, + &node, &leaves[i]); + if (ret) + goto free_leaves; + i++; + } + + /* Prepare the msg reply in advance, to avoid device operation + * rollback on allocation failure. + */ + msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); + if (!msg) + goto free_leaves; + + hierarchy = net_shaper_hierarchy_setup(binding); + if (!hierarchy) { + ret = -ENOMEM; + goto free_msg; + } + + /* Record the node shapers that this group() operation can make + * childless for later cleanup. + */ + for (i = 0; i < leaves_count; i++) { + if (leaves[i].parent.scope == NET_SHAPER_SCOPE_NODE && + net_shaper_handle_cmp(&leaves[i].parent, &node.handle)) { + struct net_shaper *tmp; + + tmp = net_shaper_lookup(binding, &leaves[i].parent); + if (!tmp) + continue; + + old_nodes[old_nodes_count++] = tmp; + } + } + + ret = __net_shaper_group(binding, leaves_count, leaves, &node, + info->extack); + if (ret) + goto free_msg; + + /* Check if we need to delete any node left alone by the new leaves + * linkage. + */ + for (i = 0; i < old_nodes_count; ++i) { + struct net_shaper *tmp = old_nodes[i]; + + if (--tmp->leaves > 0) + continue; + + /* Errors here are not fatal: the grouping operation is + * completed, and user-space can still explicitly clean-up + * left-over nodes. + */ + __net_shaper_delete(binding, tmp, info->extack); + } + + ret = net_shaper_group_send_reply(binding, &node.handle, info, msg); + if (ret) + GENL_SET_ERR_MSG_FMT(info, "Can't send reply"); + +free_leaves: + kfree(leaves); + +unlock: + net_shaper_unlock(binding); + return ret; + +free_msg: + kfree_skb(msg); + goto free_leaves; +} + static void net_shaper_flush(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); -- cgit v1.3 From bf230c497d31ab3bc9beac0df9e186595b351b19 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:52 +0200 Subject: net-shapers: implement delete support for NODE scope shaper Leverage the previously introduced group operation to implement the removal of NODE scope shaper, re-linking its leaves under the the parent node before actually deleting the specified NODE scope shaper. Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/763d484b5b69e365acccfd8031b183c647a367a4.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 86 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index c23ac611850d..ddd1999b3f27 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -785,7 +785,8 @@ static int net_shaper_parent_from_leaves(int leaves_count, } static int __net_shaper_group(struct net_shaper_binding *binding, - int leaves_count, struct net_shaper *leaves, + bool update_node, int leaves_count, + struct net_shaper *leaves, struct net_shaper *node, struct netlink_ext_ack *extack) { @@ -831,12 +832,14 @@ static int __net_shaper_group(struct net_shaper_binding *binding, } } - /* For newly created node scope shaper, the following will update - * the handle, due to id allocation. - */ - ret = net_shaper_pre_insert(binding, &node->handle, extack); - if (ret) - return ret; + if (update_node) { + /* For newly created node scope shaper, the following will + * update the handle, due to id allocation. + */ + ret = net_shaper_pre_insert(binding, &node->handle, extack); + if (ret) + return ret; + } for (i = 0; i < leaves_count; ++i) { leaf_handle = leaves[i].handle; @@ -864,7 +867,8 @@ static int __net_shaper_group(struct net_shaper_binding *binding, */ if (new_node && parent) parent->leaves++; - net_shaper_commit(binding, 1, node); + if (update_node) + net_shaper_commit(binding, 1, node); net_shaper_commit(binding, leaves_count, leaves); return 0; @@ -873,6 +877,64 @@ rollback: return ret; } +static int net_shaper_pre_del_node(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack) +{ + struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); + struct net_shaper *cur, *leaves, node = {}; + int ret, leaves_count = 0; + unsigned long index; + bool update_node; + + if (!shaper->leaves) + return 0; + + /* Fetch the new node information. */ + node.handle = shaper->parent; + cur = net_shaper_lookup(binding, &node.handle); + if (cur) { + node = *cur; + } else { + /* A scope NODE shaper can be nested only to the NETDEV scope + * shaper without creating the latter, this check may fail only + * if the data is in inconsistent status. + */ + if (WARN_ON_ONCE(node.handle.scope != NET_SHAPER_SCOPE_NETDEV)) + return -EINVAL; + } + + leaves = kcalloc(shaper->leaves, sizeof(struct net_shaper), + GFP_KERNEL); + if (!leaves) + return -ENOMEM; + + /* Build the leaves arrays. */ + xa_for_each(&hierarchy->shapers, index, cur) { + if (net_shaper_handle_cmp(&cur->parent, &shaper->handle)) + continue; + + if (WARN_ON_ONCE(leaves_count == shaper->leaves)) { + ret = -EINVAL; + goto free; + } + + leaves[leaves_count++] = *cur; + } + + /* When re-linking to the netdev shaper, avoid the eventual, implicit, + * creation of the new node, would be surprising since the user is + * doing a delete operation. + */ + update_node = node.handle.scope != NET_SHAPER_SCOPE_NETDEV; + ret = __net_shaper_group(binding, update_node, leaves_count, + leaves, &node, extack); + +free: + kfree(leaves); + return ret; +} + int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_hierarchy *hierarchy; @@ -905,9 +967,9 @@ int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) } if (handle.scope == NET_SHAPER_SCOPE_NODE) { - /* TODO: implement support for scope NODE delete. */ - ret = -EINVAL; - goto unlock; + ret = net_shaper_pre_del_node(binding, shaper, info->extack); + if (ret) + goto unlock; } ret = __net_shaper_delete(binding, shaper, info->extack); @@ -1027,7 +1089,7 @@ int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) } } - ret = __net_shaper_group(binding, leaves_count, leaves, &node, + ret = __net_shaper_group(binding, true, leaves_count, leaves, &node, info->extack); if (ret) goto free_msg; -- cgit v1.3 From ff7d4deb1f3e18b983cb51fc2dcb7af57991d827 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:53 +0200 Subject: net-shapers: implement shaper cleanup on queue deletion hook into netif_set_real_num_tx_queues() to cleanup any shaper configured on top of the to-be-destroyed TX queues. Reviewed-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/6da4ee03cae2b2a757d7b59e88baf09cc94c5ef1.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 ++ net/core/dev.h | 4 ++++ net/shaper/shaper.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 6e727c49a6f7..b590eefce3b4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2949,6 +2949,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); + net_shaper_set_real_num_tx_queues(dev, txq); + dev_qdisc_change_real_num_tx(dev, txq); dev->real_num_tx_queues = txq; diff --git a/net/core/dev.h b/net/core/dev.h index 13c558874af3..d3ea92949ff3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -37,8 +37,12 @@ void dev_addr_check(struct net_device *dev); #if IS_ENABLED(CONFIG_NET_SHAPER) void net_shaper_flush_netdev(struct net_device *dev); +void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq); #else static inline void net_shaper_flush_netdev(struct net_device *dev) {} +static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) {} #endif /* sysctls not referred to from outside net/core/ */ diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index ddd1999b3f27..85ad172833fc 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -1157,6 +1157,54 @@ void net_shaper_flush_netdev(struct net_device *dev) net_shaper_flush(&binding); } +void net_shaper_set_real_num_tx_queues(struct net_device *dev, + unsigned int txq) +{ + struct net_shaper_hierarchy *hierarchy; + struct net_shaper_binding binding; + int i; + + binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; + binding.netdev = dev; + hierarchy = net_shaper_hierarchy(&binding); + if (!hierarchy) + return; + + /* Only drivers implementing shapers support ensure + * the lock is acquired in advance. + */ + lockdep_assert_held(&dev->lock); + + /* Take action only when decreasing the tx queue number. */ + for (i = txq; i < dev->real_num_tx_queues; ++i) { + struct net_shaper_handle handle, parent_handle; + struct net_shaper *shaper; + u32 index; + + handle.scope = NET_SHAPER_SCOPE_QUEUE; + handle.id = i; + shaper = net_shaper_lookup(&binding, &handle); + if (!shaper) + continue; + + /* Don't touch the H/W for the queue shaper, the drivers already + * deleted the queue and related resources. + */ + parent_handle = shaper->parent; + index = net_shaper_handle_to_index(&handle); + xa_erase(&hierarchy->shapers, index); + kfree_rcu(shaper, rcu); + + /* The recursion on parent does the full job. */ + if (parent_handle.scope != NET_SHAPER_SCOPE_NODE) + continue; + + shaper = net_shaper_lookup(&binding, &parent_handle); + if (shaper && !--shaper->leaves) + __net_shaper_delete(&binding, shaper, NULL); + } +} + static int __init shaper_init(void) { return genl_register_family(&net_shaper_nl_family); -- cgit v1.3 From 14bba9285aedefb99647d716b0f61bf32081e387 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:54 +0200 Subject: netlink: spec: add shaper introspection support Allow the user-space to fine-grain query the shaping features supported by the NIC on each domain. Reviewed-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/3ddd10e450e3fe7d4b944c0d0b886d4483529ee6.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/net_shaper.yaml | 88 +++++++++++++++++++++++++++++ include/uapi/linux/net_shaper.h | 17 ++++++ net/shaper/shaper.c | 32 +++++++++++ net/shaper/shaper_nl_gen.c | 29 ++++++++++ net/shaper/shaper_nl_gen.h | 10 ++++ 5 files changed, 176 insertions(+) (limited to 'net') diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml index 618fc09932ff..8ebad0d02904 100644 --- a/Documentation/netlink/specs/net_shaper.yaml +++ b/Documentation/netlink/specs/net_shaper.yaml @@ -26,6 +26,11 @@ doc: | The user can query the running configuration via the @get operation. + Different devices can provide different feature sets, e.g. with no + support for complex scheduling hierarchy, or for some shaping + parameters. The user can introspect the HW capabilities via the + @cap-get operation. + definitions: - type: enum @@ -148,6 +153,53 @@ attribute-sets: name: priority - name: weight + - + name: caps + attributes: + - + name: ifindex + type: u32 + doc: Interface index queried for shapers capabilities. + - + name: scope + type: u32 + enum: scope + doc: The scope to which the queried capabilities apply. + - + name: support-metric-bps + type: flag + doc: The device accepts 'bps' metric for bw-min, bw-max and burst. + - + name: support-metric-pps + type: flag + doc: The device accepts 'pps' metric for bw-min, bw-max and burst. + - + name: support-nesting + type: flag + doc: | + The device supports nesting shaper belonging to this scope + below 'node' scoped shapers. Only 'queue' and 'node' + scope can have flag 'support-nesting'. + - + name: support-bw-min + type: flag + doc: The device supports a minimum guaranteed B/W. + - + name: support-bw-max + type: flag + doc: The device supports maximum B/W shaping. + - + name: support-burst + type: flag + doc: The device supports a maximum burst size. + - + name: support-priority + type: flag + doc: The device supports priority scheduling. + - + name: support-weight + type: flag + doc: The device supports weighted round robin scheduling. operations: list: @@ -272,3 +324,39 @@ operations: - leaves reply: attributes: *ns-binding + + - + name: cap-get + doc: | + Get the shaper capabilities supported by the given device + for the specified scope. + attribute-set: caps + + do: + pre: net-shaper-nl-cap-pre-doit + post: net-shaper-nl-cap-post-doit + request: + attributes: + - ifindex + - scope + reply: + attributes: &cap-attrs + - ifindex + - scope + - support-metric-bps + - support-metric-pps + - support-nesting + - support-bw-min + - support-bw-max + - support-burst + - support-priority + - support-weight + + dump: + pre: net-shaper-nl-cap-pre-dumpit + post: net-shaper-nl-cap-post-dumpit + request: + attributes: + - ifindex + reply: + attributes: *cap-attrs diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h index 9e3fa63618ee..d8834b59f7d7 100644 --- a/include/uapi/linux/net_shaper.h +++ b/include/uapi/linux/net_shaper.h @@ -65,11 +65,28 @@ enum { NET_SHAPER_A_HANDLE_MAX = (__NET_SHAPER_A_HANDLE_MAX - 1) }; +enum { + NET_SHAPER_A_CAPS_IFINDEX = 1, + NET_SHAPER_A_CAPS_SCOPE, + NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS, + NET_SHAPER_A_CAPS_SUPPORT_METRIC_PPS, + NET_SHAPER_A_CAPS_SUPPORT_NESTING, + NET_SHAPER_A_CAPS_SUPPORT_BW_MIN, + NET_SHAPER_A_CAPS_SUPPORT_BW_MAX, + NET_SHAPER_A_CAPS_SUPPORT_BURST, + NET_SHAPER_A_CAPS_SUPPORT_PRIORITY, + NET_SHAPER_A_CAPS_SUPPORT_WEIGHT, + + __NET_SHAPER_A_CAPS_MAX, + NET_SHAPER_A_CAPS_MAX = (__NET_SHAPER_A_CAPS_MAX - 1) +}; + enum { NET_SHAPER_CMD_GET = 1, NET_SHAPER_CMD_SET, NET_SHAPER_CMD_DELETE, NET_SHAPER_CMD_GROUP, + NET_SHAPER_CMD_CAP_GET, __NET_SHAPER_CMD_MAX, NET_SHAPER_CMD_MAX = (__NET_SHAPER_CMD_MAX - 1) diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 85ad172833fc..92c8da046391 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -598,6 +598,27 @@ int net_shaper_nl_post_dumpit(struct netlink_callback *cb) return 0; } +int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ +} + +int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_binding *binding; @@ -1126,6 +1147,17 @@ free_msg: goto free_leaves; } +int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return 0; +} + +int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return 0; +} + static void net_shaper_flush(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 34185c5989e6..204c8ae8c7b1 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -65,6 +65,17 @@ static const struct nla_policy net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES + [NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy), }; +/* NET_SHAPER_CMD_CAP_GET - do */ +static const struct nla_policy net_shaper_cap_get_do_nl_policy[NET_SHAPER_A_CAPS_SCOPE + 1] = { + [NET_SHAPER_A_CAPS_IFINDEX] = { .type = NLA_U32, }, + [NET_SHAPER_A_CAPS_SCOPE] = NLA_POLICY_MAX(NLA_U32, 3), +}; + +/* NET_SHAPER_CMD_CAP_GET - dump */ +static const struct nla_policy net_shaper_cap_get_dump_nl_policy[NET_SHAPER_A_CAPS_IFINDEX + 1] = { + [NET_SHAPER_A_CAPS_IFINDEX] = { .type = NLA_U32, }, +}; + /* Ops table for net_shaper */ static const struct genl_split_ops net_shaper_nl_ops[] = { { @@ -112,6 +123,24 @@ static const struct genl_split_ops net_shaper_nl_ops[] = { .maxattr = NET_SHAPER_A_LEAVES, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NET_SHAPER_CMD_CAP_GET, + .pre_doit = net_shaper_nl_cap_pre_doit, + .doit = net_shaper_nl_cap_get_doit, + .post_doit = net_shaper_nl_cap_post_doit, + .policy = net_shaper_cap_get_do_nl_policy, + .maxattr = NET_SHAPER_A_CAPS_SCOPE, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = NET_SHAPER_CMD_CAP_GET, + .start = net_shaper_nl_cap_pre_dumpit, + .dumpit = net_shaper_nl_cap_get_dumpit, + .done = net_shaper_nl_cap_post_dumpit, + .policy = net_shaper_cap_get_dump_nl_policy, + .maxattr = NET_SHAPER_A_CAPS_IFINDEX, + .flags = GENL_CMD_CAP_DUMP, + }, }; struct genl_family net_shaper_nl_family __ro_after_init = { diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index 016cb6f3187b..cb7f9026fc23 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -17,17 +17,27 @@ extern const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGH int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +void +net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_pre_dumpit(struct netlink_callback *cb); +int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb); int net_shaper_nl_post_dumpit(struct netlink_callback *cb); +int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb); int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info); int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info); +int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); extern struct genl_family net_shaper_nl_family; -- cgit v1.3 From 553ea9f1efd6e8410b01f7a31cfb71a97cadcd8b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:55 +0200 Subject: net: shaper: implement introspection support The netlink op is a simple wrapper around the device callback. Extend the existing fetch_dev() helper adding an attribute argument for the requested device. Reuse such helper in the newly implemented operation. Reviewed-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/66eb62f22b3a5ba06ca91d01ae77515e5f447e15.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 92c8da046391..f9399984165a 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -601,22 +601,29 @@ int net_shaper_nl_post_dumpit(struct netlink_callback *cb) int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + return net_shaper_generic_pre(info, NET_SHAPER_A_CAPS_IFINDEX); } void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + net_shaper_generic_post(info); } int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + + return net_shaper_ctx_setup(genl_info_dump(cb), + NET_SHAPER_A_CAPS_IFINDEX, ctx); } int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb) { - return -EOPNOTSUPP; + struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; + + net_shaper_ctx_cleanup(ctx); + return 0; } int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) @@ -1147,14 +1154,99 @@ free_msg: goto free_leaves; } +static int +net_shaper_cap_fill_one(struct sk_buff *msg, + struct net_shaper_binding *binding, + enum net_shaper_scope scope, unsigned long flags, + const struct genl_info *info) +{ + unsigned long cur; + void *hdr; + + hdr = genlmsg_iput(msg, info); + if (!hdr) + return -EMSGSIZE; + + if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_CAPS_IFINDEX) || + nla_put_u32(msg, NET_SHAPER_A_CAPS_SCOPE, scope)) + goto nla_put_failure; + + for (cur = NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS; + cur <= NET_SHAPER_A_CAPS_MAX; ++cur) { + if (flags & BIT(cur) && nla_put_flag(msg, cur)) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) { + struct net_shaper_binding *binding; + const struct net_shaper_ops *ops; + enum net_shaper_scope scope; + unsigned long flags = 0; + struct sk_buff *msg; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_CAPS_SCOPE)) + return -EINVAL; + + binding = net_shaper_binding_from_ctx(info->ctx); + scope = nla_get_u32(info->attrs[NET_SHAPER_A_CAPS_SCOPE]); + ops = net_shaper_ops(binding); + ops->capabilities(binding, scope, &flags); + if (!flags) + return -EOPNOTSUPP; + + msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = net_shaper_cap_fill_one(msg, binding, scope, flags, info); + if (ret) + goto free_msg; + + ret = genlmsg_reply(msg, info); + if (ret) + goto free_msg; return 0; + +free_msg: + nlmsg_free(msg); + return ret; } int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + const struct genl_info *info = genl_info_dump(cb); + struct net_shaper_binding *binding; + const struct net_shaper_ops *ops; + enum net_shaper_scope scope; + int ret; + + binding = net_shaper_binding_from_ctx(cb->ctx); + ops = net_shaper_ops(binding); + for (scope = 0; scope <= NET_SHAPER_SCOPE_MAX; ++scope) { + unsigned long flags = 0; + + ops->capabilities(binding, scope, &flags); + if (!flags) + continue; + + ret = net_shaper_cap_fill_one(skb, binding, scope, flags, + info); + if (ret) + return ret; + } + return 0; } -- cgit v1.3 From ecd82cfee355d63c1b961a0fb8dadd8aab9dc2aa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Oct 2024 10:09:56 +0200 Subject: net-shapers: implement cap validation in the core Use the device capabilities to reject invalid attribute values before pushing them to the H/W. Note that validating the metric explicitly avoids NL_SET_BAD_ATTR() usage, to provide unambiguous error messages to the user. Validating the nesting requires the knowledge of the new parent for the given shaper; as such is a chicken-egg problem: to validate the leaf nesting we need to know the node scope, to validate the node nesting we need to know the leafs parent scope. To break the circular dependency, place the leafs nesting validation after the parsing. Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/54667601813e4c0348f39bf8ad2446ffc9fcd383.1728460186.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/shaper/shaper.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'net') diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index f9399984165a..15463062fe7b 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -439,6 +439,74 @@ static int net_shaper_parse_handle(const struct nlattr *attr, return 0; } +static int net_shaper_validate_caps(struct net_shaper_binding *binding, + struct nlattr **tb, + const struct genl_info *info, + struct net_shaper *shaper) +{ + const struct net_shaper_ops *ops = net_shaper_ops(binding); + struct nlattr *bad = NULL; + unsigned long caps = 0; + + ops->capabilities(binding, shaper->handle.scope, &caps); + + if (tb[NET_SHAPER_A_PRIORITY] && + !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_PRIORITY))) + bad = tb[NET_SHAPER_A_PRIORITY]; + if (tb[NET_SHAPER_A_WEIGHT] && + !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_WEIGHT))) + bad = tb[NET_SHAPER_A_WEIGHT]; + if (tb[NET_SHAPER_A_BW_MIN] && + !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MIN))) + bad = tb[NET_SHAPER_A_BW_MIN]; + if (tb[NET_SHAPER_A_BW_MAX] && + !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX))) + bad = tb[NET_SHAPER_A_BW_MAX]; + if (tb[NET_SHAPER_A_BURST] && + !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BURST))) + bad = tb[NET_SHAPER_A_BURST]; + + if (!caps) + bad = tb[NET_SHAPER_A_HANDLE]; + + if (bad) { + NL_SET_BAD_ATTR(info->extack, bad); + return -EOPNOTSUPP; + } + + if (shaper->handle.scope == NET_SHAPER_SCOPE_QUEUE && + binding->type == NET_SHAPER_BINDING_TYPE_NETDEV && + shaper->handle.id >= binding->netdev->real_num_tx_queues) { + NL_SET_ERR_MSG_FMT(info->extack, + "Not existing queue id %d max %d", + shaper->handle.id, + binding->netdev->real_num_tx_queues); + return -ENOENT; + } + + /* The metric is really used only if there is *any* rate-related + * setting, either in current attributes set or in pre-existing + * values. + */ + if (shaper->burst || shaper->bw_min || shaper->bw_max) { + u32 metric_cap = NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS + + shaper->metric; + + /* The metric test can fail even when the user did not + * specify the METRIC attribute. Pointing to rate related + * attribute will be confusing, as the attribute itself + * could be indeed supported, with a different metric. + * Be more specific. + */ + if (!(caps & BIT(metric_cap))) { + NL_SET_ERR_MSG_FMT(info->extack, "Bad metric %d", + shaper->metric); + return -EOPNOTSUPP; + } + } + return 0; +} + static int net_shaper_parse_info(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, @@ -487,6 +555,28 @@ static int net_shaper_parse_info(struct net_shaper_binding *binding, if (tb[NET_SHAPER_A_WEIGHT]) shaper->weight = nla_get_u32(tb[NET_SHAPER_A_WEIGHT]); + + ret = net_shaper_validate_caps(binding, tb, info, shaper); + if (ret < 0) + return ret; + + return 0; +} + +static int net_shaper_validate_nesting(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack) +{ + const struct net_shaper_ops *ops = net_shaper_ops(binding); + unsigned long caps = 0; + + ops->capabilities(binding, shaper->handle.scope, &caps); + if (!(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_NESTING))) { + NL_SET_ERR_MSG_FMT(extack, + "Nesting not supported for scope %d", + shaper->handle.scope); + return -EOPNOTSUPP; + } return 0; } @@ -517,6 +607,13 @@ static int net_shaper_parse_leaf(struct net_shaper_binding *binding, return -EINVAL; } + if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { + ret = net_shaper_validate_nesting(binding, shaper, + info->extack); + if (ret < 0) + return ret; + } + if (!exists) net_shaper_default_parent(&shaper->handle, &shaper->parent); return 0; @@ -858,6 +955,10 @@ static int __net_shaper_group(struct net_shaper_binding *binding, node->parent.scope, node->parent.id); return -ENOENT; } + + ret = net_shaper_validate_nesting(binding, node, extack); + if (ret < 0) + return ret; } if (update_node) { -- cgit v1.3 From cd959bf7c3bbaf64a29750c5e36776078a18a8fe Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 9 Oct 2024 11:05:21 +0100 Subject: net/smc: Address spelling errors Address spelling errors flagged by codespell. This patch is intended to cover all files under drivers/smc Signed-off-by: Simon Horman Reviewed-by: D. Wythe Reviewed-by: Guangguan Wang Reviewed-by: Randy Dunlap Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/20241009-smc-starspell-v1-1-b8b395bbaf82@kernel.org Signed-off-by: Jakub Kicinski --- net/smc/smc.h | 2 +- net/smc/smc_clc.h | 2 +- net/smc/smc_core.c | 2 +- net/smc/smc_core.h | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/smc.h b/net/smc/smc.h index ad77d6b6b8d3..78ae10d06ed2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -278,7 +278,7 @@ struct smc_connection { */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ - u8 freed : 1; /* normal termiation */ + u8 freed : 1; /* normal termination */ u8 out_of_sync : 1; /* out of sync with peer */ }; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5625fda2960b..5fd6f5b8ef03 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -156,7 +156,7 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ } __aligned(4); struct smc_clc_msg_smcd { /* SMC-D GID information */ - struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ + struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requester */ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ u8 vendor_oui[3]; /* vendor organizationally unique identifier */ u8 vendor_exp_options[5]; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 4e694860ece4..500952c2e67b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2321,7 +2321,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, } if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) goto out; - fallthrough; // try virtually continguous buf + fallthrough; // try virtually contiguous buf case SMCR_VIRT_CONT_BUFS: buf_desc->order = get_order(bufsize); buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 0db4e5f79ac4..69b54ecd6503 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -30,7 +30,7 @@ */ #define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for * SMC-R v2.1 and later negotiation, vendors or - * distrubutions may modify it to a value between + * distributions may modify it to a value between * 16-255 as needed. */ @@ -181,7 +181,7 @@ struct smc_link { */ #define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for * SMC-R v2.1 and later negotiation, vendors or - * distrubutions may modify it to a value between + * distributions may modify it to a value between * 1-2 as needed. */ -- cgit v1.3 From d677aebd663ddc287f2b2bda098474694a0ca875 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 03:41:00 +0000 Subject: tcp: move sysctl_tcp_l3mdev_accept to netns_ipv4_read_rx sysctl_tcp_l3mdev_accept is read from TCP receive fast path from tcp_v6_early_demux(), __inet6_lookup_established, inet_request_bound_dev_if(). Move it to netns_ipv4_read_rx. Remove the '#ifdef CONFIG_NET_L3_MASTER_DEV' that was guarding its definition. Note this adds a hole of three bytes that could be filled later. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Cc: Wei Wang Cc: Coco Li Link: https://patch.msgid.link/20241010034100.320832-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst | 2 +- include/net/netns/ipv4.h | 5 ++--- net/core/net_namespace.c | 4 +++- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst index 392e08a6ec04..629da6dc6d74 100644 --- a/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst +++ b/Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst @@ -59,7 +59,7 @@ u8 sysctl_udp_early_demux u8 sysctl_nexthop_compat_mode u8 sysctl_fwmark_reflect u8 sysctl_tcp_fwmark_accept -u8 sysctl_tcp_l3mdev_accept +u8 sysctl_tcp_l3mdev_accept read_mostly __inet6_lookup_established/inet_request_bound_dev_if u8 sysctl_tcp_mtu_probing int sysctl_tcp_mtu_probe_floor int sysctl_tcp_base_mss diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 66a4cffc44ee..3b1de80b5c25 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,6 +76,8 @@ struct netns_ipv4 { __cacheline_group_begin(netns_ipv4_read_rx); u8 sysctl_ip_early_demux; u8 sysctl_tcp_early_demux; + u8 sysctl_tcp_l3mdev_accept; + /* 3 bytes hole, try to pack */ int sysctl_tcp_reordering; int sysctl_tcp_rmem[3]; __cacheline_group_end(netns_ipv4_read_rx); @@ -151,9 +153,6 @@ struct netns_ipv4 { u8 sysctl_fwmark_reflect; u8 sysctl_tcp_fwmark_accept; -#ifdef CONFIG_NET_L3_MASTER_DEV - u8 sysctl_tcp_l3mdev_accept; -#endif u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a5bc1fd8b034..0a86aff17f51 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1159,11 +1159,13 @@ static void __init netns_ipv4_struct_check(void) sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_l3mdev_accept); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 18); + CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif -- cgit v1.3 From 37f670aacd481128ad9a940ac2d3372aecd92824 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:15 -0700 Subject: lsm: use lsm_prop in security_current_getsecid Change the security_current_getsecid_subj() and security_task_getsecid_obj() interfaces to fill in a lsm_prop structure instead of a u32 secid. Audit interfaces will need to collect all possible security data for possible reporting. Cc: linux-integrity@vger.kernel.org Cc: audit@vger.kernel.org Cc: selinux@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 6 ++-- include/linux/security.h | 13 ++++---- kernel/audit.c | 11 +++---- kernel/auditfilter.c | 3 +- kernel/auditsc.c | 22 ++++++++----- net/netlabel/netlabel_unlabeled.c | 5 ++- net/netlabel/netlabel_user.h | 6 +++- security/apparmor/lsm.c | 20 ++++++++---- security/integrity/ima/ima.h | 6 ++-- security/integrity/ima/ima_api.c | 6 ++-- security/integrity/ima/ima_appraise.c | 6 ++-- security/integrity/ima/ima_main.c | 59 ++++++++++++++++++----------------- security/integrity/ima/ima_policy.c | 14 ++++----- security/security.c | 28 ++++++++--------- security/selinux/hooks.c | 17 ++++++---- security/smack/smack_lsm.c | 25 +++++++++------ 16 files changed, 139 insertions(+), 108 deletions(-) (limited to 'net') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 6ef2a345ea03..8a90fd9ff3c8 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -235,9 +235,9 @@ LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old) LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) -LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid) -LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, - struct task_struct *p, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, current_getlsmprop_subj, struct lsm_prop *prop) +LSM_HOOK(void, LSM_RET_VOID, task_getlsmprop_obj, + struct task_struct *p, struct lsm_prop *prop) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio) LSM_HOOK(int, 0, task_getioprio, struct task_struct *p) diff --git a/include/linux/security.h b/include/linux/security.h index 15aef5f68e77..9bc8153f4e8b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -507,8 +507,8 @@ int security_task_fix_setgroups(struct cred *new, const struct cred *old); int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); int security_task_getsid(struct task_struct *p); -void security_current_getsecid_subj(u32 *secid); -void security_task_getsecid_obj(struct task_struct *p, u32 *secid); +void security_current_getlsmprop_subj(struct lsm_prop *prop); +void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); @@ -1305,14 +1305,15 @@ static inline int security_task_getsid(struct task_struct *p) return 0; } -static inline void security_current_getsecid_subj(u32 *secid) +static inline void security_current_getlsmprop_subj(struct lsm_prop *prop) { - *secid = 0; + lsmprop_init(prop); } -static inline void security_task_getsecid_obj(struct task_struct *p, u32 *secid) +static inline void security_task_getlsmprop_obj(struct task_struct *p, + struct lsm_prop *prop) { - *secid = 0; + lsmprop_init(prop); } static inline int security_task_setnice(struct task_struct *p, int nice) diff --git a/kernel/audit.c b/kernel/audit.c index 47c41e6f9ea9..d2797e8fe182 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2179,16 +2179,16 @@ void audit_log_key(struct audit_buffer *ab, char *key) int audit_log_task_context(struct audit_buffer *ab) { + struct lsm_prop prop; char *ctx = NULL; unsigned len; int error; - u32 sid; - security_current_getsecid_subj(&sid); - if (!sid) + security_current_getlsmprop_subj(&prop); + if (!lsmprop_is_set(&prop)) return 0; - error = security_secid_to_secctx(sid, &ctx, &len); + error = security_lsmprop_to_secctx(&prop, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; @@ -2405,8 +2405,7 @@ int audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - /* scaffolding */ - security_current_getsecid_subj(&audit_sig_lsm.scaffold.secid); + security_current_getlsmprop_subj(&audit_sig_lsm); } return audit_signal_info_syscall(t); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 288a2092fd0d..a7de3dabe6e1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1371,8 +1371,7 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_CLR: if (f->lsm_rule) { /* scaffolding */ - security_current_getsecid_subj( - &prop.scaffold.secid); + security_current_getlsmprop_subj(&prop); result = security_audit_rule_match( &prop, f->type, f->op, f->lsm_rule); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 5019eb32a97f..6b2b2a8d5647 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -470,7 +470,6 @@ static int audit_filter_rules(struct task_struct *tsk, { const struct cred *cred; int i, need_sid = 1; - u32 sid; struct lsm_prop prop = { }; unsigned int sessionid; @@ -675,15 +674,14 @@ static int audit_filter_rules(struct task_struct *tsk, * fork()/copy_process() in which case * the new @tsk creds are still a dup * of @current's creds so we can still - * use security_current_getsecid_subj() + * use + * security_current_getlsmprop_subj() * here even though it always refs * @current's creds */ - security_current_getsecid_subj(&sid); + security_current_getlsmprop_subj(&prop); need_sid = 0; } - /* scaffolding */ - prop.scaffold.secid = sid; result = security_audit_rule_match(&prop, f->type, f->op, @@ -2730,12 +2728,15 @@ int __audit_sockaddr(int len, void *a) void __audit_ptrace(struct task_struct *t) { struct audit_context *context = audit_context(); + struct lsm_prop prop; context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); - security_task_getsecid_obj(t, &context->target_sid); + security_task_getlsmprop_obj(t, &prop); + /* scaffolding */ + context->target_sid = prop.scaffold.secid; memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2751,6 +2752,7 @@ int audit_signal_info_syscall(struct task_struct *t) struct audit_aux_data_pids *axp; struct audit_context *ctx = audit_context(); kuid_t t_uid = task_uid(t); + struct lsm_prop prop; if (!audit_signals || audit_dummy_context()) return 0; @@ -2762,7 +2764,9 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); - security_task_getsecid_obj(t, &ctx->target_sid); + security_task_getlsmprop_obj(t, &prop); + /* scaffolding */ + ctx->target_sid = prop.scaffold.secid; memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); return 0; } @@ -2783,7 +2787,9 @@ int audit_signal_info_syscall(struct task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]); + security_task_getlsmprop_obj(t, &prop); + /* scaffolding */ + axp->target_sid[axp->pid_count] = prop.scaffold.secid; memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 9996883bf2b7..5925f48a3ade 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1534,11 +1534,14 @@ int __init netlbl_unlabel_defconf(void) int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; + struct lsm_prop prop; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ - security_current_getsecid_subj(&audit_info.secid); + security_current_getlsmprop_subj(&prop); + /* scaffolding */ + audit_info.secid = prop.scaffold.secid; audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index d6c5b31eb4eb..39f4f6df5f51 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -32,7 +32,11 @@ */ static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { - security_current_getsecid_subj(&audit_info->secid); + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + /* scaffolding */ + audit_info->secid = prop.scaffold.secid; audit_info->loginuid = audit_get_loginuid(current); audit_info->sessionid = audit_get_sessionid(current); } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index a58b72ed246c..6331bcb35ec0 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -982,17 +982,24 @@ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm) return; } -static void apparmor_current_getsecid_subj(u32 *secid) +static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop) { struct aa_label *label = __begin_current_label_crit_section(); - *secid = label->secid; + + prop->apparmor.label = label; + /* scaffolding */ + prop->scaffold.secid = label->secid; __end_current_label_crit_section(label); } -static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid) +static void apparmor_task_getlsmprop_obj(struct task_struct *p, + struct lsm_prop *prop) { struct aa_label *label = aa_get_task_label(p); - *secid = label->secid; + + prop->apparmor.label = label; + /* scaffolding */ + prop->scaffold.secid = label->secid; aa_put_label(label); } @@ -1503,8 +1510,9 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = { LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), - LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj), - LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj), + LSM_HOOK_INIT(current_getlsmprop_subj, + apparmor_current_getlsmprop_subj), + LSM_HOOK_INIT(task_getlsmprop_obj, apparmor_task_getlsmprop_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), LSM_HOOK_INIT(userns_create, apparmor_userns_create), diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index cdfe8c8c7bac..c0d3b716d11f 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -369,7 +369,7 @@ static inline void ima_process_queued_keys(void) {} /* LIM API function definitions */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, - const struct cred *cred, u32 secid, int mask, + const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); @@ -400,8 +400,8 @@ const char *ima_d_path(const struct path *path, char **pathbuf, char *filename); /* IMA policy related functions */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, - const struct cred *cred, u32 secid, enum ima_hooks func, - int mask, int flags, int *pcr, + const struct cred *cred, struct lsm_prop *prop, + enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos); void ima_init_policy(void); diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index 984e861f6e33..c35ea613c9f8 100644 --- a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -165,7 +165,7 @@ err_out: * @idmap: idmap of the mount the inode was found from * @inode: pointer to the inode associated with the object being validated * @cred: pointer to credentials structure to validate - * @secid: secid of the task being validated + * @prop: properties of the task being validated * @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC, * MAY_APPEND) * @func: caller identifier @@ -187,7 +187,7 @@ err_out: * */ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, - const struct cred *cred, u32 secid, int mask, + const struct cred *cred, struct lsm_prop *prop, int mask, enum ima_hooks func, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) @@ -196,7 +196,7 @@ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode, flags &= ima_policy_flag; - return ima_match_policy(idmap, inode, cred, secid, func, mask, + return ima_match_policy(idmap, inode, cred, prop, func, mask, flags, pcr, template_desc, func_data, allowed_algos); } diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 656c709b974f..884a3533f7af 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -73,13 +73,13 @@ bool is_ima_appraise_enabled(void) int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode, int mask, enum ima_hooks func) { - u32 secid; + struct lsm_prop prop; if (!ima_appraise) return 0; - security_current_getsecid_subj(&secid); - return ima_match_policy(idmap, inode, current_cred(), secid, + security_current_getlsmprop_subj(&prop); + return ima_match_policy(idmap, inode, current_cred(), &prop, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); } diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 06132cf47016..553a6f54a1e2 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -206,8 +206,8 @@ static void ima_file_free(struct file *file) } static int process_measurement(struct file *file, const struct cred *cred, - u32 secid, char *buf, loff_t size, int mask, - enum ima_hooks func) + struct lsm_prop *prop, char *buf, loff_t size, + int mask, enum ima_hooks func) { struct inode *real_inode, *inode = file_inode(file); struct ima_iint_cache *iint = NULL; @@ -232,7 +232,7 @@ static int process_measurement(struct file *file, const struct cred *cred, * bitmask based on the appraise/audit/measurement policy. * Included is the appraise submask. */ - action = ima_get_action(file_mnt_idmap(file), inode, cred, secid, + action = ima_get_action(file_mnt_idmap(file), inode, cred, prop, mask, func, &pcr, &template_desc, NULL, &allowed_algos); violation_check = ((func == FILE_CHECK || func == MMAP_CHECK || @@ -443,23 +443,23 @@ out: static int ima_file_mmap(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { - u32 secid; + struct lsm_prop prop; int ret; if (!file) return 0; - security_current_getsecid_subj(&secid); + security_current_getlsmprop_subj(&prop); if (reqprot & PROT_EXEC) { - ret = process_measurement(file, current_cred(), secid, NULL, + ret = process_measurement(file, current_cred(), &prop, NULL, 0, MAY_EXEC, MMAP_CHECK_REQPROT); if (ret) return ret; } if (prot & PROT_EXEC) - return process_measurement(file, current_cred(), secid, NULL, + return process_measurement(file, current_cred(), &prop, NULL, 0, MAY_EXEC, MMAP_CHECK); return 0; @@ -488,9 +488,9 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, char *pathbuf = NULL; const char *pathname = NULL; struct inode *inode; + struct lsm_prop prop; int result = 0; int action; - u32 secid; int pcr; /* Is mprotect making an mmap'ed file executable? */ @@ -498,13 +498,13 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC)) return 0; - security_current_getsecid_subj(&secid); + security_current_getlsmprop_subj(&prop); inode = file_inode(vma->vm_file); action = ima_get_action(file_mnt_idmap(vma->vm_file), inode, - current_cred(), secid, MAY_EXEC, MMAP_CHECK, + current_cred(), &prop, MAY_EXEC, MMAP_CHECK, &pcr, &template, NULL, NULL); action |= ima_get_action(file_mnt_idmap(vma->vm_file), inode, - current_cred(), secid, MAY_EXEC, + current_cred(), &prop, MAY_EXEC, MMAP_CHECK_REQPROT, &pcr, &template, NULL, NULL); @@ -542,15 +542,18 @@ static int ima_bprm_check(struct linux_binprm *bprm) { int ret; u32 secid; + struct lsm_prop prop = { }; - security_current_getsecid_subj(&secid); - ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0, - MAY_EXEC, BPRM_CHECK); + security_current_getlsmprop_subj(&prop); + ret = process_measurement(bprm->file, current_cred(), + &prop, NULL, 0, MAY_EXEC, BPRM_CHECK); if (ret) return ret; security_cred_getsecid(bprm->cred, &secid); - return process_measurement(bprm->file, bprm->cred, secid, NULL, 0, + /* scaffolding */ + prop.scaffold.secid = secid; + return process_measurement(bprm->file, bprm->cred, &prop, NULL, 0, MAY_EXEC, CREDS_CHECK); } @@ -566,10 +569,10 @@ static int ima_bprm_check(struct linux_binprm *bprm) */ static int ima_file_check(struct file *file, int mask) { - u32 secid; + struct lsm_prop prop; - security_current_getsecid_subj(&secid); - return process_measurement(file, current_cred(), secid, NULL, 0, + security_current_getlsmprop_subj(&prop); + return process_measurement(file, current_cred(), &prop, NULL, 0, mask & (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND), FILE_CHECK); } @@ -768,7 +771,7 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id, bool contents) { enum ima_hooks func; - u32 secid; + struct lsm_prop prop; /* * Do devices using pre-allocated memory run the risk of the @@ -788,9 +791,9 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id, /* Read entire file for all partial reads. */ func = read_idmap[read_id] ?: FILE_CHECK; - security_current_getsecid_subj(&secid); - return process_measurement(file, current_cred(), secid, NULL, - 0, MAY_READ, func); + security_current_getlsmprop_subj(&prop); + return process_measurement(file, current_cred(), &prop, NULL, 0, + MAY_READ, func); } const int read_idmap[READING_MAX_ID] = { @@ -818,7 +821,7 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id read_id) { enum ima_hooks func; - u32 secid; + struct lsm_prop prop; /* permit signed certs */ if (!file && read_id == READING_X509_CERTIFICATE) @@ -831,8 +834,8 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size, } func = read_idmap[read_id] ?: FILE_CHECK; - security_current_getsecid_subj(&secid); - return process_measurement(file, current_cred(), secid, buf, size, + security_current_getlsmprop_subj(&prop); + return process_measurement(file, current_cred(), &prop, buf, size, MAY_READ, func); } @@ -967,7 +970,7 @@ int process_buffer_measurement(struct mnt_idmap *idmap, int digest_hash_len = hash_digest_size[ima_hash_algo]; int violation = 0; int action = 0; - u32 secid; + struct lsm_prop prop; if (digest && digest_len < digest_hash_len) return -EINVAL; @@ -990,9 +993,9 @@ int process_buffer_measurement(struct mnt_idmap *idmap, * buffer measurements. */ if (func) { - security_current_getsecid_subj(&secid); + security_current_getlsmprop_subj(&prop); action = ima_get_action(idmap, inode, current_cred(), - secid, 0, func, &pcr, &template, + &prop, 0, func, &pcr, &template, func_data, NULL); if (!(action & IMA_MEASURE) && !digest) return -ENOENT; diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 22a62e675ebc..a96dc3ff6aa0 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -557,7 +557,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule, * @idmap: idmap of the mount the inode was found from * @inode: a pointer to an inode * @cred: a pointer to a credentials structure for user validation - * @secid: the secid of the task to be validated + * @prop: LSM properties of the task to be validated * @func: LIM hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @func_data: func specific data, may be NULL @@ -567,7 +567,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule, static bool ima_match_rules(struct ima_rule_entry *rule, struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, - u32 secid, enum ima_hooks func, int mask, + struct lsm_prop *prop, enum ima_hooks func, int mask, const char *func_data) { int i; @@ -658,8 +658,6 @@ retry: case LSM_SUBJ_USER: case LSM_SUBJ_ROLE: case LSM_SUBJ_TYPE: - /* scaffolding */ - prop.scaffold.secid = secid; rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); @@ -723,7 +721,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) * @inode: pointer to an inode for which the policy decision is being made * @cred: pointer to a credentials structure for which the policy decision is * being made - * @secid: LSM secid of the task to be validated + * @prop: LSM properties of the task to be validated * @func: IMA hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE) @@ -740,8 +738,8 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) * than writes so ima_match_policy() is classical RCU candidate. */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, - const struct cred *cred, u32 secid, enum ima_hooks func, - int mask, int flags, int *pcr, + const struct cred *cred, struct lsm_prop *prop, + enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { @@ -759,7 +757,7 @@ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, if (!(entry->action & actmask)) continue; - if (!ima_match_rules(entry, idmap, inode, cred, secid, + if (!ima_match_rules(entry, idmap, inode, cred, prop, func, mask, func_data)) continue; diff --git a/security/security.c b/security/security.c index f269421c2d72..5cdb5b171ff2 100644 --- a/security/security.c +++ b/security/security.c @@ -3492,33 +3492,33 @@ int security_task_getsid(struct task_struct *p) } /** - * security_current_getsecid_subj() - Get the current task's subjective secid - * @secid: secid value + * security_current_getlsmprop_subj() - Current task's subjective LSM data + * @prop: lsm specific information * * Retrieve the subjective security identifier of the current task and return - * it in @secid. In case of failure, @secid will be set to zero. + * it in @prop. */ -void security_current_getsecid_subj(u32 *secid) +void security_current_getlsmprop_subj(struct lsm_prop *prop) { - *secid = 0; - call_void_hook(current_getsecid_subj, secid); + lsmprop_init(prop); + call_void_hook(current_getlsmprop_subj, prop); } -EXPORT_SYMBOL(security_current_getsecid_subj); +EXPORT_SYMBOL(security_current_getlsmprop_subj); /** - * security_task_getsecid_obj() - Get a task's objective secid + * security_task_getlsmprop_obj() - Get a task's objective LSM data * @p: target task - * @secid: secid value + * @prop: lsm specific information * * Retrieve the objective security identifier of the task_struct in @p and - * return it in @secid. In case of failure, @secid will be set to zero. + * return it in @prop. */ -void security_task_getsecid_obj(struct task_struct *p, u32 *secid) +void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { - *secid = 0; - call_void_hook(task_getsecid_obj, p, secid); + lsmprop_init(prop); + call_void_hook(task_getlsmprop_obj, p, prop); } -EXPORT_SYMBOL(security_task_getsecid_obj); +EXPORT_SYMBOL(security_task_getlsmprop_obj); /** * security_task_setnice() - Check if setting a task's nice value is allowed diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 1d43367009ed..7d6ffd3483a8 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4169,14 +4169,19 @@ static int selinux_task_getsid(struct task_struct *p) PROCESS__GETSESSION, NULL); } -static void selinux_current_getsecid_subj(u32 *secid) +static void selinux_current_getlsmprop_subj(struct lsm_prop *prop) { - *secid = current_sid(); + prop->selinux.secid = current_sid(); + /* scaffolding */ + prop->scaffold.secid = prop->selinux.secid; } -static void selinux_task_getsecid_obj(struct task_struct *p, u32 *secid) +static void selinux_task_getlsmprop_obj(struct task_struct *p, + struct lsm_prop *prop) { - *secid = task_sid_obj(p); + prop->selinux.secid = task_sid_obj(p); + /* scaffolding */ + prop->scaffold.secid = prop->selinux.secid; } static int selinux_task_setnice(struct task_struct *p, int nice) @@ -7203,8 +7208,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), - LSM_HOOK_INIT(current_getsecid_subj, selinux_current_getsecid_subj), - LSM_HOOK_INIT(task_getsecid_obj, selinux_task_getsecid_obj), + LSM_HOOK_INIT(current_getlsmprop_subj, selinux_current_getlsmprop_subj), + LSM_HOOK_INIT(task_getlsmprop_obj, selinux_task_getlsmprop_obj), LSM_HOOK_INIT(task_setnice, selinux_task_setnice), LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio), LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio), diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fed44b4fc73d..e74e22c4232f 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2239,30 +2239,35 @@ static int smack_task_getsid(struct task_struct *p) } /** - * smack_current_getsecid_subj - get the subjective secid of the current task - * @secid: where to put the result + * smack_current_getlsmprop_subj - get the subjective secid of the current task + * @prop: where to put the result * * Sets the secid to contain a u32 version of the task's subjective smack label. */ -static void smack_current_getsecid_subj(u32 *secid) +static void smack_current_getlsmprop_subj(struct lsm_prop *prop) { struct smack_known *skp = smk_of_current(); - *secid = skp->smk_secid; + prop->smack.skp = skp; + /* scaffolding */ + prop->scaffold.secid = skp->smk_secid; } /** - * smack_task_getsecid_obj - get the objective secid of the task + * smack_task_getlsmprop_obj - get the objective data of the task * @p: the task - * @secid: where to put the result + * @prop: where to put the result * * Sets the secid to contain a u32 version of the task's objective smack label. */ -static void smack_task_getsecid_obj(struct task_struct *p, u32 *secid) +static void smack_task_getlsmprop_obj(struct task_struct *p, + struct lsm_prop *prop) { struct smack_known *skp = smk_of_task_struct_obj(p); - *secid = skp->smk_secid; + prop->smack.skp = skp; + /* scaffolding */ + prop->scaffold.secid = skp->smk_secid; } /** @@ -5130,8 +5135,8 @@ static struct security_hook_list smack_hooks[] __ro_after_init = { LSM_HOOK_INIT(task_setpgid, smack_task_setpgid), LSM_HOOK_INIT(task_getpgid, smack_task_getpgid), LSM_HOOK_INIT(task_getsid, smack_task_getsid), - LSM_HOOK_INIT(current_getsecid_subj, smack_current_getsecid_subj), - LSM_HOOK_INIT(task_getsecid_obj, smack_task_getsecid_obj), + LSM_HOOK_INIT(current_getlsmprop_subj, smack_current_getlsmprop_subj), + LSM_HOOK_INIT(task_getlsmprop_obj, smack_task_getlsmprop_obj), LSM_HOOK_INIT(task_setnice, smack_task_setnice), LSM_HOOK_INIT(task_setioprio, smack_task_setioprio), LSM_HOOK_INIT(task_getioprio, smack_task_getioprio), -- cgit v1.3 From 05a344e54d0b4892736526e4a309851da8ee9c89 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 9 Oct 2024 10:32:20 -0700 Subject: netlabel,smack: use lsm_prop for audit data Replace the secid in the netlbl_audit structure with an lsm_prop. Remove scaffolding that was required when the value was a secid. Signed-off-by: Casey Schaufler [PM: fix the subject line] Signed-off-by: Paul Moore --- include/net/netlabel.h | 2 +- net/netlabel/netlabel_unlabeled.c | 5 +---- net/netlabel/netlabel_user.c | 7 +++---- net/netlabel/netlabel_user.h | 6 +----- security/smack/smackfs.c | 4 +--- 5 files changed, 7 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/netlabel.h b/include/net/netlabel.h index 529160f76cac..8de8344ee93c 100644 --- a/include/net/netlabel.h +++ b/include/net/netlabel.h @@ -97,7 +97,7 @@ struct calipso_doi; /* NetLabel audit information */ struct netlbl_audit { - u32 secid; + struct lsm_prop prop; kuid_t loginuid; unsigned int sessionid; }; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 5925f48a3ade..1bc2d0890a9f 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1534,14 +1534,11 @@ int __init netlbl_unlabel_defconf(void) int ret_val; struct netlbl_dom_map *entry; struct netlbl_audit audit_info; - struct lsm_prop prop; /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ - security_current_getlsmprop_subj(&prop); - /* scaffolding */ - audit_info.secid = prop.scaffold.secid; + security_current_getlsmprop_subj(&audit_info.prop); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 3ed4fea2a2de..81635a13987b 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -98,10 +98,9 @@ struct audit_buffer *netlbl_audit_start_common(int type, from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - if (audit_info->secid != 0 && - security_secid_to_secctx(audit_info->secid, - &secctx, - &secctx_len) == 0) { + if (lsmprop_is_set(&audit_info->prop) && + security_lsmprop_to_secctx(&audit_info->prop, &secctx, + &secctx_len) == 0) { audit_log_format(audit_buf, " subj=%s", secctx); security_release_secctx(secctx, secctx_len); } diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index 39f4f6df5f51..d4c434956212 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -32,11 +32,7 @@ */ static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { - struct lsm_prop prop; - - security_current_getlsmprop_subj(&prop); - /* scaffolding */ - audit_info->secid = prop.scaffold.secid; + security_current_getlsmprop_subj(&audit_info->prop); audit_info->loginuid = audit_get_loginuid(current); audit_info->sessionid = audit_get_sessionid(current); } diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 5dd1e164f9b1..1401412fd794 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -182,11 +182,9 @@ static inline void smack_catset_bit(unsigned int cat, char *catsetp) */ static void smk_netlabel_audit_set(struct netlbl_audit *nap) { - struct smack_known *skp = smk_of_current(); - nap->loginuid = audit_get_loginuid(current); nap->sessionid = audit_get_sessionid(current); - nap->secid = skp->smk_secid; + nap->prop.smack.skp = smk_of_current(); } /* -- cgit v1.3 From a716ff52bebfe806fbcf5227cee4c7bda4e6724b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:01 +0000 Subject: fib: rules: use READ_ONCE()/WRITE_ONCE() on ops->fib_rules_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() on readers. Constify 'struct net' argument of fib_rules_seq_read() and lookup_rules_ops(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 2 +- net/core/fib_rules.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index d17855c52ef9..04383d90a1e3 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -176,7 +176,7 @@ int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); -unsigned int fib_rules_seq_read(struct net *net, int family); +unsigned int fib_rules_seq_read(const struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 154a2681f55c..82ef090c0037 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -101,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule, struct fib_rules_ops *ops, struct nlmsghdr *nlh, u32 pid); -static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family) +static struct fib_rules_ops *lookup_rules_ops(const struct net *net, + int family) { struct fib_rules_ops *ops; @@ -370,7 +371,9 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ops->fib_rules_seq++; + ASSERT_RTNL(); + /* Paired with READ_ONCE() in fib_rules_seq() */ + WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); } @@ -397,17 +400,16 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, } EXPORT_SYMBOL_GPL(fib_rules_dump); -unsigned int fib_rules_seq_read(struct net *net, int family) +unsigned int fib_rules_seq_read(const struct net *net, int family) { unsigned int fib_rules_seq; struct fib_rules_ops *ops; - ASSERT_RTNL(); - ops = lookup_rules_ops(net, family); if (!ops) return 0; - fib_rules_seq = ops->fib_rules_seq; + /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */ + fib_rules_seq = READ_ONCE(ops->fib_rules_seq); rules_ops_put(ops); return fib_rules_seq; -- cgit v1.3 From 16207384d29287a19f81436e1953b41946aa8258 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:02 +0000 Subject: ipv4: use READ_ONCE()/WRITE_ONCE() on net->ipv4.fib_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() when reading it. Constify 'struct net' argument of fib4_rules_seq_read() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 4 ++-- include/net/netns/ipv4.h | 2 +- net/ipv4/fib_notifier.c | 8 ++++---- net/ipv4/fib_rules.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 06130933542d..b6e44f4eaa4c 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -347,7 +347,7 @@ static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static inline unsigned int fib4_rules_seq_read(struct net *net) +static inline unsigned int fib4_rules_seq_read(const struct net *net) { return 0; } @@ -411,7 +411,7 @@ static inline bool fib4_has_custom_rules(const struct net *net) bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib4_rules_seq_read(struct net *net); +unsigned int fib4_rules_seq_read(const struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3b1de80b5c25..3c014170e001 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -262,7 +262,7 @@ struct netns_ipv4 { #endif struct fib_notifier_ops *notifier_ops; - unsigned int fib_seq; /* protected by rtnl_mutex */ + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct fib_notifier_ops *ipmr_notifier_ops; unsigned int ipmr_seq; /* protected by rtnl_mutex */ diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 0e23ade74493..21c85c80de64 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -22,15 +22,15 @@ int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, ASSERT_RTNL(); info->family = AF_INET; - net->ipv4.fib_seq++; + /* Paired with READ_ONCE() in fib4_seq_read() */ + WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1); return call_fib_notifiers(net, event_type, info); } static unsigned int fib4_seq_read(struct net *net) { - ASSERT_RTNL(); - - return net->ipv4.fib_seq + fib4_rules_seq_read(net); + /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ + return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); } static int fib4_dump(struct net *net, struct notifier_block *nb, diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index b07292d50ee7..8325224ef072 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -74,7 +74,7 @@ int fib4_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, AF_INET, extack); } -unsigned int fib4_rules_seq_read(struct net *net) +unsigned int fib4_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET); } -- cgit v1.3 From e60ea45447768c48309b944596a8a34f6bae50e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:03 +0000 Subject: ipv6: use READ_ONCE()/WRITE_ONCE() on fib6_table->fib_seq Using RTNL to protect ops->fib_rules_seq reads seems a big hammer. Writes are protected by RTNL. We can use READ_ONCE() when reading it. Constify 'struct net' argument of fib6_tables_seq_read() and fib6_rules_seq_read(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 8 ++++---- net/ipv6/fib6_rules.c | 2 +- net/ipv6/ip6_fib.c | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6cb867ce4878..7c87873ae211 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -394,7 +394,7 @@ struct fib6_table { struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; - unsigned int fib_seq; + unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; @@ -563,7 +563,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); -unsigned int fib6_tables_seq_read(struct net *net); +unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); @@ -632,7 +632,7 @@ void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); -unsigned int fib6_rules_seq_read(struct net *net); +unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, @@ -676,7 +676,7 @@ static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, { return 0; } -static inline unsigned int fib6_rules_seq_read(struct net *net) +static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 04a9ed5e8310..c85c1627cb16 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -56,7 +56,7 @@ int fib6_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, AF_INET6, extack); } -unsigned int fib6_rules_seq_read(struct net *net) +unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index eb111d20615c..cea160b249d2 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -345,17 +345,17 @@ static void __net_init fib6_tables_init(struct net *net) #endif -unsigned int fib6_tables_seq_read(struct net *net) +unsigned int fib6_tables_seq_read(const struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - struct hlist_head *head = &net->ipv6.fib_table_hash[h]; - struct fib6_table *tb; + const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + const struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) - fib_seq += tb->fib_seq; + fib_seq += READ_ONCE(tb->fib_seq); } rcu_read_unlock(); @@ -400,7 +400,7 @@ int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -416,7 +416,7 @@ int call_fib6_multipath_entry_notifiers(struct net *net, .nsiblings = nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, event_type, &info.info); } @@ -427,7 +427,7 @@ int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) .nsiblings = rt->fib6_nsiblings, }; - rt->fib6_table->fib_seq++; + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } -- cgit v1.3 From 055202b16c589cc82cc8ab9d4316701547fb8853 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:04 +0000 Subject: ipmr: use READ_ONCE() to read net->ipv[46].ipmr_seq mr_call_vif_notifiers() and mr_call_mfc_notifiers() already uses WRITE_ONCE() on the write side. Using RTNL to protect the reads seems a big hammer. Constify 'struct net' argument of ip6mr_rules_seq_read() and ipmr_rules_seq_read(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 8 +++----- net/ipv6/ip6mr.c | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 089864c6a35e..35ed03165184 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -288,7 +288,7 @@ static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack); } -static unsigned int ipmr_rules_seq_read(struct net *net) +static unsigned int ipmr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); } @@ -346,7 +346,7 @@ static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static unsigned int ipmr_rules_seq_read(struct net *net) +static unsigned int ipmr_rules_seq_read(const struct net *net) { return 0; } @@ -3037,9 +3037,7 @@ static const struct net_protocol pim_protocol = { static unsigned int ipmr_seq_read(struct net *net) { - ASSERT_RTNL(); - - return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); + return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2ce4ae0d8dc3..3f9501fd8c1a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -276,7 +276,7 @@ static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } -static unsigned int ip6mr_rules_seq_read(struct net *net) +static unsigned int ip6mr_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } @@ -335,7 +335,7 @@ static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, return 0; } -static unsigned int ip6mr_rules_seq_read(struct net *net) +static unsigned int ip6mr_rules_seq_read(const struct net *net) { return 0; } @@ -1262,9 +1262,7 @@ static int ip6mr_device_event(struct notifier_block *this, static unsigned int ip6mr_seq_read(struct net *net) { - ASSERT_RTNL(); - - return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); + return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, -- cgit v1.3 From 2698acd6ea4770809af7e65bb8b3250e0a3a807e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 18:44:05 +0000 Subject: net: do not acquire rtnl in fib_seq_sum() After we made sure no fib_seq_read() handlers needs RTNL anymore, we can remove RTNL from fib_seq_sum(). Note that after RTNL was dropped, fib_seq_sum() result was possibly outdated anyway. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: David Ahern Link: https://patch.msgid.link/20241009184405.3752829-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/fib_notifier.h | 2 +- net/core/fib_notifier.c | 2 -- net/ipv4/fib_notifier.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv6/fib6_notifier.c | 2 +- net/ipv6/ip6mr.c | 2 +- 6 files changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 6d59221ff05a..48aad6128fea 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -28,7 +28,7 @@ enum fib_event_type { struct fib_notifier_ops { int family; struct list_head list; - unsigned int (*fib_seq_read)(struct net *net); + unsigned int (*fib_seq_read)(const struct net *net); int (*fib_dump)(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct module *owner; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index fc96259807b6..5cdca49b1d7c 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -43,7 +43,6 @@ static unsigned int fib_seq_sum(struct net *net) struct fib_notifier_ops *ops; unsigned int fib_seq = 0; - rtnl_lock(); rcu_read_lock(); list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) { if (!try_module_get(ops->owner)) @@ -52,7 +51,6 @@ static unsigned int fib_seq_sum(struct net *net) module_put(ops->owner); } rcu_read_unlock(); - rtnl_unlock(); return fib_seq; } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 21c85c80de64..b1551c26554b 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -27,7 +27,7 @@ int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, return call_fib_notifiers(net, event_type, info); } -static unsigned int fib4_seq_read(struct net *net) +static unsigned int fib4_seq_read(const struct net *net) { /* Paired with WRITE_ONCE() in call_fib4_notifiers() */ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 35ed03165184..7a95daeb1946 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3035,7 +3035,7 @@ static const struct net_protocol pim_protocol = { }; #endif -static unsigned int ipmr_seq_read(struct net *net) +static unsigned int ipmr_seq_read(const struct net *net) { return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c index f87ae33e1d01..949b72610df7 100644 --- a/net/ipv6/fib6_notifier.c +++ b/net/ipv6/fib6_notifier.c @@ -22,7 +22,7 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, return call_fib_notifiers(net, event_type, info); } -static unsigned int fib6_seq_read(struct net *net) +static unsigned int fib6_seq_read(const struct net *net) { return fib6_tables_seq_read(net) + fib6_rules_seq_read(net); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 3f9501fd8c1a..9528e17665fd 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1260,7 +1260,7 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; } -static unsigned int ip6mr_seq_read(struct net *net) +static unsigned int ip6mr_seq_read(const struct net *net) { return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } -- cgit v1.3 From edc344568922eb9588e77ba49de1ef0cb9a2ff1c Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 9 Oct 2024 13:53:46 +0300 Subject: net: ethtool: Add new parameters and a function to support EPL In the CMIS specification for pluggable modules, LPL (Local Payload) and EPL (Extended Payload) are two types of data payloads used for managing various functions and features of the module. EPL payloads are used for more complex and extensive management functions that require a larger amount of data, so writing firmware blocks using EPL is much more efficient. Currently, only LPL payload is supported for writing firmware blocks to the module. Add EPL related parameters to the function ethtool_cmis_cdb_compose_args() and add a specific function for calculating the maximum allowable length extension for EPL. Both will be used in the next patch to add support for writing firmware blocks using EPL. Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ethtool/cmis.h | 12 +++++++----- net/ethtool/cmis_cdb.c | 32 +++++++++++++++++++++----------- net/ethtool/cmis_fw_update.c | 17 ++++++++++------- 3 files changed, 38 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index 3e7c293af78c..73a5060d0f4c 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -96,13 +96,15 @@ struct ethtool_cmis_cdb_rpl { u8 payload[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH]; }; -u32 ethtool_cmis_get_max_payload_size(u8 num_of_byte_octs); +u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs); +u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs); void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, - enum ethtool_cmis_cdb_cmd_id cmd, u8 *pl, - u8 lpl_len, u16 max_duration, - u8 read_write_len_ext, u16 msleep_pre_rpl, - u8 rpl_exp_len, u8 flags); + enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl, + u8 lpl_len, u8 *epl, u16 epl_len, + u16 max_duration, u8 read_write_len_ext, + u16 msleep_pre_rpl, u8 rpl_exp_len, + u8 flags); void ethtool_cmis_cdb_check_completion_flag(u8 cmis_rev, u8 *flags); diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index 4d5581147952..80bb475fd52a 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -11,25 +11,34 @@ * min(i, 15) byte octets where i specifies the allowable additional number of * byte octets in a READ or a WRITE. */ -u32 ethtool_cmis_get_max_payload_size(u8 num_of_byte_octs) +u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs) { return 8 * (1 + min_t(u8, num_of_byte_octs, 15)); } +/* For accessing the EPL field on page 9Fh, the allowable length extension is + * min(i, 255) byte octets where i specifies the allowable additional number of + * byte octets in a READ or a WRITE. + */ +u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs) +{ + return 8 * (1 + min_t(u8, num_of_byte_octs, 255)); +} + void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, - enum ethtool_cmis_cdb_cmd_id cmd, u8 *pl, - u8 lpl_len, u16 max_duration, - u8 read_write_len_ext, u16 msleep_pre_rpl, - u8 rpl_exp_len, u8 flags) + enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl, + u8 lpl_len, u8 *epl, u16 epl_len, + u16 max_duration, u8 read_write_len_ext, + u16 msleep_pre_rpl, u8 rpl_exp_len, u8 flags) { args->req.id = cpu_to_be16(cmd); args->req.lpl_len = lpl_len; - if (pl) - memcpy(args->req.payload, pl, args->req.lpl_len); + if (lpl) + memcpy(args->req.payload, lpl, args->req.lpl_len); args->max_duration = max_duration; args->read_write_len_ext = - ethtool_cmis_get_max_payload_size(read_write_len_ext); + ethtool_cmis_get_max_lpl_size(read_write_len_ext); args->msleep_pre_rpl = msleep_pre_rpl; args->rpl_exp_len = rpl_exp_len; args->flags = flags; @@ -183,7 +192,7 @@ cmis_cdb_validate_password(struct ethtool_cmis_cdb *cdb, } ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_QUERY_STATUS, - (u8 *)&qs_pl, sizeof(qs_pl), 0, + (u8 *)&qs_pl, sizeof(qs_pl), NULL, 0, 0, cdb->read_write_len_ext, 1000, sizeof(*rpl), CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID); @@ -245,8 +254,9 @@ static int cmis_cdb_module_features_get(struct ethtool_cmis_cdb *cdb, ethtool_cmis_cdb_check_completion_flag(cdb->cmis_rev, &flags); ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_MODULE_FEATURES, - NULL, 0, 0, cdb->read_write_len_ext, - 1000, sizeof(*rpl), flags); + NULL, 0, NULL, 0, 0, + cdb->read_write_len_ext, 1000, + sizeof(*rpl), flags); err = ethtool_cmis_cdb_execute_cmd(dev, &args); if (err < 0) { diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index 655ff5224ffa..a514127985d4 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -54,7 +54,8 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, ethtool_cmis_cdb_check_completion_flag(cdb->cmis_rev, &flags); ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_FW_MANAGMENT_FEATURES, - NULL, 0, cdb->max_completion_time, + NULL, 0, NULL, 0, + cdb->max_completion_time, cdb->read_write_len_ext, 1000, sizeof(*rpl), flags); @@ -122,7 +123,7 @@ cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb, ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_START_FW_DOWNLOAD, - (u8 *)&pl, lpl_len, + (u8 *)&pl, lpl_len, NULL, 0, fw_mng->max_duration_start, cdb->read_write_len_ext, 1000, 0, CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID); @@ -158,7 +159,7 @@ cmis_fw_update_write_image(struct ethtool_cmis_cdb *cdb, int err; max_lpl_len = min_t(u32, - ethtool_cmis_get_max_payload_size(cdb->read_write_len_ext), + ethtool_cmis_get_max_lpl_size(cdb->read_write_len_ext), ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH); max_block_size = max_lpl_len - sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl, @@ -183,7 +184,7 @@ cmis_fw_update_write_image(struct ethtool_cmis_cdb *cdb, ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_LPL, - (u8 *)&pl, lpl_len, + (u8 *)&pl, lpl_len, NULL, 0, fw_mng->max_duration_write, cdb->read_write_len_ext, 1, 0, CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID); @@ -212,7 +213,8 @@ cmis_fw_update_complete_download(struct ethtool_cmis_cdb *cdb, ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_COMPLETE_FW_DOWNLOAD, - NULL, 0, fw_mng->max_duration_complete, + NULL, 0, NULL, 0, + fw_mng->max_duration_complete, cdb->read_write_len_ext, 1000, 0, CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID); @@ -294,7 +296,7 @@ cmis_fw_update_run_image(struct ethtool_cmis_cdb *cdb, struct net_device *dev, int err; ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_RUN_FW_IMAGE, - (u8 *)&pl, sizeof(pl), + (u8 *)&pl, sizeof(pl), NULL, 0, cdb->max_completion_time, cdb->read_write_len_ext, 1000, 0, CDB_F_MODULE_STATE_VALID); @@ -326,7 +328,8 @@ cmis_fw_update_commit_image(struct ethtool_cmis_cdb *cdb, ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_COMMIT_FW_IMAGE, - NULL, 0, cdb->max_completion_time, + NULL, 0, NULL, 0, + cdb->max_completion_time, cdb->read_write_len_ext, 1000, 0, CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID); -- cgit v1.3 From 9a3b0d078bd825613c0821bf7bf5a2e1d8d60057 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Wed, 9 Oct 2024 13:53:47 +0300 Subject: net: ethtool: Add support for writing firmware blocks using EPL payload In the CMIS specification for pluggable modules, LPL (Local Payload) and EPL (Extended Payload) are two types of data payloads used for managing various functions and features of the module. EPL payloads are used for more complex and extensive management functions that require a larger amount of data, so writing firmware blocks using EPL is much more efficient. Currently, only LPL payload is supported for writing firmware blocks to the module. Add support for writing firmware block using EPL payload, both to support modules that supports only EPL write mechanism, and to optimize the flashing process of modules that support LPL and EPL. Signed-off-by: Danielle Ratson Reviewed-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/ethtool/cmis.h | 4 ++ net/ethtool/cmis_cdb.c | 66 ++++++++++++++++++++++++++++++-- net/ethtool/cmis_fw_update.c | 91 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 148 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index 73a5060d0f4c..1e790413db0e 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ #define ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH 120 +#define ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH 2048 #define ETHTOOL_CMIS_CDB_CMD_PAGE 0x9F #define ETHTOOL_CMIS_CDB_PAGE_I2C_ADDR 0x50 @@ -23,6 +24,7 @@ enum ethtool_cmis_cdb_cmd_id { ETHTOOL_CMIS_CDB_CMD_FW_MANAGMENT_FEATURES = 0x0041, ETHTOOL_CMIS_CDB_CMD_START_FW_DOWNLOAD = 0x0101, ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_LPL = 0x0103, + ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_EPL = 0x0104, ETHTOOL_CMIS_CDB_CMD_COMPLETE_FW_DOWNLOAD = 0x0107, ETHTOOL_CMIS_CDB_CMD_RUN_FW_IMAGE = 0x0109, ETHTOOL_CMIS_CDB_CMD_COMMIT_FW_IMAGE = 0x010A, @@ -38,6 +40,7 @@ enum ethtool_cmis_cdb_cmd_id { * @resv1: Added to match the CMIS standard request continuity. * @resv2: Added to match the CMIS standard request continuity. * @payload: Payload for the CDB commands. + * @epl: Extended payload for the CDB commands. */ struct ethtool_cmis_cdb_request { __be16 id; @@ -49,6 +52,7 @@ struct ethtool_cmis_cdb_request { u8 resv2; u8 payload[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH]; ); + u8 *epl; /* Everything above this field checksummed. */ }; #define CDB_F_COMPLETION_VALID BIT(0) diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index 80bb475fd52a..d159dc121bde 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -33,12 +33,19 @@ void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, { args->req.id = cpu_to_be16(cmd); args->req.lpl_len = lpl_len; - if (lpl) + if (lpl) { memcpy(args->req.payload, lpl, args->req.lpl_len); + args->read_write_len_ext = + ethtool_cmis_get_max_lpl_size(read_write_len_ext); + } + if (epl) { + args->req.epl_len = cpu_to_be16(epl_len); + args->req.epl = epl; + args->read_write_len_ext = + ethtool_cmis_get_max_epl_size(read_write_len_ext); + } args->max_duration = max_duration; - args->read_write_len_ext = - ethtool_cmis_get_max_lpl_size(read_write_len_ext); args->msleep_pre_rpl = msleep_pre_rpl; args->rpl_exp_len = rpl_exp_len; args->flags = flags; @@ -556,6 +563,49 @@ __ethtool_cmis_cdb_execute_cmd(struct net_device *dev, return err; } +#define CMIS_CDB_EPL_PAGE_START 0xA0 +#define CMIS_CDB_EPL_PAGE_END 0xAF +#define CMIS_CDB_EPL_FW_BLOCK_OFFSET_START 128 +#define CMIS_CDB_EPL_FW_BLOCK_OFFSET_END 255 + +static int +ethtool_cmis_cdb_execute_epl_cmd(struct net_device *dev, + struct ethtool_cmis_cdb_cmd_args *args, + struct ethtool_module_eeprom *page_data) +{ + u16 epl_len = be16_to_cpu(args->req.epl_len); + u32 bytes_written = 0; + u8 page; + int err; + + for (page = CMIS_CDB_EPL_PAGE_START; + page <= CMIS_CDB_EPL_PAGE_END && bytes_written < epl_len; page++) { + u16 offset = CMIS_CDB_EPL_FW_BLOCK_OFFSET_START; + + while (offset <= CMIS_CDB_EPL_FW_BLOCK_OFFSET_END && + bytes_written < epl_len) { + u32 bytes_left = epl_len - bytes_written; + u16 space_left, bytes_to_write; + + space_left = CMIS_CDB_EPL_FW_BLOCK_OFFSET_END - offset + 1; + bytes_to_write = min_t(u16, bytes_left, + min_t(u16, space_left, + args->read_write_len_ext)); + + err = __ethtool_cmis_cdb_execute_cmd(dev, page_data, + page, offset, + bytes_to_write, + args->req.epl + bytes_written); + if (err < 0) + return err; + + offset += bytes_to_write; + bytes_written += bytes_to_write; + } + } + return 0; +} + static u8 cmis_cdb_calc_checksum(const void *data, size_t size) { const u8 *bytes = (const u8 *)data; @@ -577,7 +627,9 @@ int ethtool_cmis_cdb_execute_cmd(struct net_device *dev, int err; args->req.chk_code = - cmis_cdb_calc_checksum(&args->req, sizeof(args->req)); + cmis_cdb_calc_checksum(&args->req, + offsetof(struct ethtool_cmis_cdb_request, + epl)); if (args->req.lpl_len > args->read_write_len_ext) { args->err_msg = "LPL length is longer than CDB read write length extension allows"; @@ -599,6 +651,12 @@ int ethtool_cmis_cdb_execute_cmd(struct net_device *dev, if (err < 0) return err; + if (args->req.epl_len) { + err = ethtool_cmis_cdb_execute_epl_cmd(dev, args, &page_data); + if (err < 0) + return err; + } + offset = CMIS_CDB_CMD_ID_OFFSET + offsetof(struct ethtool_cmis_cdb_request, id); err = __ethtool_cmis_cdb_execute_cmd(dev, &page_data, diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index a514127985d4..48aef6220f00 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -9,6 +9,7 @@ struct cmis_fw_update_fw_mng_features { u8 start_cmd_payload_size; + u8 write_mechanism; u16 max_duration_start; u16 max_duration_write; u16 max_duration_complete; @@ -36,7 +37,9 @@ struct cmis_cdb_fw_mng_features_rpl { }; enum cmis_cdb_fw_write_mechanism { + CMIS_CDB_FW_WRITE_MECHANISM_NONE = 0x00, CMIS_CDB_FW_WRITE_MECHANISM_LPL = 0x01, + CMIS_CDB_FW_WRITE_MECHANISM_EPL = 0x10, CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11, }; @@ -68,10 +71,9 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, } rpl = (struct cmis_cdb_fw_mng_features_rpl *)args.req.payload; - if (!(rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL || - rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_BOTH)) { + if (rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_NONE) { ethnl_module_fw_flash_ntf_err(dev, ntf_params, - "Write LPL is not supported", + "CDB write mechanism is not supported", NULL); return -EOPNOTSUPP; } @@ -83,6 +85,10 @@ cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb, */ cdb->read_write_len_ext = rpl->read_write_len_ext; fw_mng->start_cmd_payload_size = rpl->start_cmd_payload_size; + fw_mng->write_mechanism = + rpl->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ? + CMIS_CDB_FW_WRITE_MECHANISM_LPL : + CMIS_CDB_FW_WRITE_MECHANISM_EPL; fw_mng->max_duration_start = be16_to_cpu(rpl->max_duration_start); fw_mng->max_duration_write = be16_to_cpu(rpl->max_duration_write); fw_mng->max_duration_complete = be16_to_cpu(rpl->max_duration_complete); @@ -149,9 +155,9 @@ struct cmis_cdb_write_fw_block_lpl_pl { }; static int -cmis_fw_update_write_image(struct ethtool_cmis_cdb *cdb, - struct ethtool_cmis_fw_update_params *fw_update, - struct cmis_fw_update_fw_mng_features *fw_mng) +cmis_fw_update_write_image_lpl(struct ethtool_cmis_cdb *cdb, + struct ethtool_cmis_fw_update_params *fw_update, + struct cmis_fw_update_fw_mng_features *fw_mng) { u8 start = fw_mng->start_cmd_payload_size; u32 offset, max_block_size, max_lpl_len; @@ -202,6 +208,67 @@ cmis_fw_update_write_image(struct ethtool_cmis_cdb *cdb, return 0; } +struct cmis_cdb_write_fw_block_epl_pl { + u8 fw_block[ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH]; +}; + +static int +cmis_fw_update_write_image_epl(struct ethtool_cmis_cdb *cdb, + struct ethtool_cmis_fw_update_params *fw_update, + struct cmis_fw_update_fw_mng_features *fw_mng) +{ + u8 start = fw_mng->start_cmd_payload_size; + u32 image_size = fw_update->fw->size; + u32 offset, lpl_len; + int err; + + lpl_len = sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl, + block_address); + + for (offset = start; offset < image_size; + offset += ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH) { + struct cmis_cdb_write_fw_block_lpl_pl lpl = { + .block_address = cpu_to_be32(offset - start), + }; + struct cmis_cdb_write_fw_block_epl_pl *epl; + struct ethtool_cmis_cdb_cmd_args args = {}; + u32 epl_len; + + ethnl_module_fw_flash_ntf_in_progress(fw_update->dev, + &fw_update->ntf_params, + offset - start, + image_size); + + epl_len = min_t(u32, ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH, + image_size - offset); + epl = kmalloc_array(epl_len, sizeof(u8), GFP_KERNEL); + if (!epl) + return -ENOMEM; + + memcpy(epl->fw_block, &fw_update->fw->data[offset], epl_len); + + ethtool_cmis_cdb_compose_args(&args, + ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_EPL, + (u8 *)&lpl, lpl_len, (u8 *)epl, + epl_len, + fw_mng->max_duration_write, + cdb->read_write_len_ext, 1, 0, + CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID); + + err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args); + kfree(epl); + if (err < 0) { + ethnl_module_fw_flash_ntf_err(fw_update->dev, + &fw_update->ntf_params, + "Write FW block EPL command failed", + args.err_msg); + return err; + } + } + + return 0; +} + static int cmis_fw_update_complete_download(struct ethtool_cmis_cdb *cdb, struct net_device *dev, @@ -238,9 +305,15 @@ cmis_fw_update_download_image(struct ethtool_cmis_cdb *cdb, if (err < 0) return err; - err = cmis_fw_update_write_image(cdb, fw_update, fw_mng); - if (err < 0) - return err; + if (fw_mng->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL) { + err = cmis_fw_update_write_image_lpl(cdb, fw_update, fw_mng); + if (err < 0) + return err; + } else { + err = cmis_fw_update_write_image_epl(cdb, fw_update, fw_mng); + if (err < 0) + return err; + } err = cmis_fw_update_complete_download(cdb, fw_update->dev, fw_mng, &fw_update->ntf_params); -- cgit v1.3 From 356c81b6c494a359ed6e25087931acc78c518fb9 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:53 +0200 Subject: batman-adv: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 47 +++----------------------------------- 1 file changed, 3 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 6815d1262feb..b44c382226a1 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -209,20 +209,6 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, return tt_global_entry; } -/** - * batadv_tt_local_entry_free_rcu() - free the tt_local_entry - * @rcu: rcu pointer of the tt_local_entry - */ -static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_local_entry *tt_local_entry; - - tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, - common.rcu); - - kmem_cache_free(batadv_tl_cache, tt_local_entry); -} - /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period @@ -237,7 +223,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) batadv_softif_vlan_put(tt_local_entry->vlan); - call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); + kfree_rcu(tt_local_entry, common.rcu); } /** @@ -255,20 +241,6 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) batadv_tt_local_entry_release); } -/** - * batadv_tt_global_entry_free_rcu() - free the tt_global_entry - * @rcu: rcu pointer of the tt_global_entry - */ -static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_global_entry *tt_global_entry; - - tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, - common.rcu); - - kmem_cache_free(batadv_tg_cache, tt_global_entry); -} - /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period @@ -283,7 +255,7 @@ void batadv_tt_global_entry_release(struct kref *ref) batadv_tt_global_del_orig_list(tt_global_entry); - call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); + kfree_rcu(tt_global_entry, common.rcu); } /** @@ -408,19 +380,6 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, batadv_tt_global_size_mod(orig_node, vid, -1); } -/** - * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry - * @rcu: rcu pointer of the orig_entry - */ -static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) -{ - struct batadv_tt_orig_list_entry *orig_entry; - - orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - - kmem_cache_free(batadv_tt_orig_cache, orig_entry); -} - /** * batadv_tt_orig_list_entry_release() - release tt orig entry from lists and * queue for free after rcu grace period @@ -434,7 +393,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) refcount); batadv_orig_node_put(orig_entry->orig_node); - call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); + kfree_rcu(orig_entry, rcu); } /** -- cgit v1.3 From e4c416533f0633c96eba849832a9212ace3c3ec4 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Thu, 10 Oct 2024 17:27:44 +0800 Subject: net: hsr: convert to use new timer APIs del_timer() and del_timer_sync() have been renamed to timer_delete() and timer_delete_sync(). Inconsistent API usage makes the code a bit confusing, so replace with the new APIs. No functional changes intended. Signed-off-by: Yu Liao Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/hsr/hsr_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index f6ff0b61e08a..6f09b9512484 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -128,9 +128,9 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) { struct hsr_priv *hsr = netdev_priv(dev); - del_timer_sync(&hsr->prune_timer); - del_timer_sync(&hsr->prune_proxy_timer); - del_timer_sync(&hsr->announce_timer); + timer_delete_sync(&hsr->prune_timer); + timer_delete_sync(&hsr->prune_proxy_timer); + timer_delete_sync(&hsr->announce_timer); timer_delete_sync(&hsr->announce_proxy_timer); hsr_debugfs_term(hsr); -- cgit v1.3 From b692bf9a7543af7ad11a59d182a3757578f0ba53 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:53 +0200 Subject: xsk: Get rid of xdp_buff_xsk::xskb_list_node Let's bring xdp_buff_xsk back to occupying 2 cachelines by removing xskb_list_node - for the purpose of gathering the xskb frags free_list_node can be used, head of the list (xsk_buff_pool::xskb_list) stays as-is, just reuse the node ptr. It is safe to do as a single xdp_buff_xsk can never reside in two pool's lists simultaneously. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-2-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 1 - net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 1 - 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0a5dca2b2b3f..360bc1244c6a 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { - list_del(&pos->xskb_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_del(&pos->free_list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, xskb_list_node); + struct xdp_buff_xsk, free_list_node); if (frag) { - list_del(&frag->xskb_list_node); + list_del(&frag->free_list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->xskb_list_node); + list_del(&xskb->free_list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - xskb_list_node); + free_list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bacb33f1e3e5..aa7f1d0b3a5e 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -30,7 +30,6 @@ struct xdp_buff_xsk { struct xsk_buff_pool *pool; u64 orig_addr; struct list_head free_list_node; - struct list_head xskb_list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1140b2a120ca..9c93064349a8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->xskb_list_node); + list_del(&pos->free_list_node); } return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 521a2938e50a..e5368db7d18e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -102,7 +102,6 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); - INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else -- cgit v1.3 From 30ec2c1baaead43903ad63ff8e3083949059083c Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:54 +0200 Subject: xsk: s/free_list_node/list_node/ Now that free_list_node's purpose is two-folded, make it just a 'list_node'. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-3-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 14 +++++++------- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 4 ++-- net/xdp/xsk_buff_pool.c | 14 +++++++------- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 360bc1244c6a..40085afd9160 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -126,8 +126,8 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) if (likely(!xdp_buff_has_frags(xdp))) goto out; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { - list_del(&pos->free_list_node); + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { + list_del(&pos->list_node); xp_free(pos); } @@ -140,7 +140,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); - list_add_tail(&frag->free_list_node, &frag->pool->xskb_list); + list_add_tail(&frag->list_node, &frag->pool->xskb_list); } static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) @@ -150,9 +150,9 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_first_entry_or_null(&xskb->pool->xskb_list, - struct xdp_buff_xsk, free_list_node); + struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->free_list_node); + list_del(&frag->list_node); ret = &frag->xdp; } @@ -163,7 +163,7 @@ static inline void xsk_buff_del_tail(struct xdp_buff *tail) { struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); - list_del(&xskb->free_list_node); + list_del(&xskb->list_node); } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -172,7 +172,7 @@ static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) struct xdp_buff_xsk *frag; frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, - free_list_node); + list_node); return &frag->xdp; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index aa7f1d0b3a5e..af8b6f776f86 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; u64 orig_addr; - struct list_head free_list_node; + struct list_head list_node; }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c93064349a8..520023405908 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -171,14 +171,14 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; xskb_list = &xskb->pool->xskb_list; - list_for_each_entry_safe(pos, tmp, xskb_list, free_list_node) { + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; - list_del(&pos->free_list_node); + list_del(&pos->list_node); } return 0; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e5368db7d18e..973557d5e4f7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -101,7 +101,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -549,8 +549,8 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } else { pool->free_list_cnt--; xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, - free_list_node); - list_del_init(&xskb->free_list_node); + list_node); + list_del_init(&xskb->list_node); } xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; @@ -616,8 +616,8 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3 i = nb_entries; while (i--) { - xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); - list_del_init(&xskb->free_list_node); + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, list_node); + list_del_init(&xskb->list_node); *xdp = &xskb->xdp; xdp++; @@ -687,11 +687,11 @@ EXPORT_SYMBOL(xp_can_alloc); void xp_free(struct xdp_buff_xsk *xskb) { - if (!list_empty(&xskb->free_list_node)) + if (!list_empty(&xskb->list_node)) return; xskb->pool->free_list_cnt++; - list_add(&xskb->free_list_node, &xskb->pool->free_list); + list_add(&xskb->list_node, &xskb->pool->free_list); } EXPORT_SYMBOL(xp_free); -- cgit v1.3 From bea14124bacbe5c9366381e62635eed28ac892ae Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:55 +0200 Subject: xsk: Get rid of xdp_buff_xsk::orig_addr Continue the process of dieting xdp_buff_xsk by removing orig_addr member. It can be calculated from xdp->data_hard_start where it was previously used, so it is not anything that has to be carried around in struct used widely in hot path. This has been used for initializing xdp_buff_xsk::frame_dma during pool setup and as a shortcut in xp_get_handle() to retrieve address provided to xsk Rx queue. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-4-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 19 +++++++++++-------- net/xdp/xsk.c | 2 +- net/xdp/xsk_buff_pool.c | 4 +++- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index af8b6f776f86..468a23b1b4c5 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -28,7 +28,6 @@ struct xdp_buff_xsk { dma_addr_t dma; dma_addr_t frame_dma; struct xsk_buff_pool *pool; - u64 orig_addr; struct list_head list_node; }; @@ -119,7 +118,6 @@ void xp_free(struct xdp_buff_xsk *xskb); static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, u64 addr) { - xskb->orig_addr = addr; xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; } @@ -221,14 +219,19 @@ static inline void xp_release(struct xdp_buff_xsk *xskb) xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; } -static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) +static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, + struct xsk_buff_pool *pool) { - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + u64 orig_addr = xskb->xdp.data - pool->addrs; + u64 offset; - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); + if (!pool->unaligned) + return orig_addr; + + offset = xskb->xdp.data - xskb->xdp.data_hard_start; + orig_addr -= offset; + offset += pool->headroom; + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 520023405908..6c31c1de1619 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -141,7 +141,7 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u64 addr; int err; - addr = xp_get_handle(xskb); + addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 973557d5e4f7..7ecd4ccd2473 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -416,8 +416,10 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_ for (i = 0; i < pool->heads_cnt; i++) { struct xdp_buff_xsk *xskb = &pool->heads[i]; + u64 orig_addr; - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + orig_addr = xskb->xdp.data_hard_start - pool->addrs - pool->headroom; + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, orig_addr); } } -- cgit v1.3 From 6e126872191df946a6fe01b79273119d32d96711 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:56 +0200 Subject: xsk: Carry a copy of xdp_zc_max_segs within xsk_buff_pool This so we avoid dereferencing struct net_device within hot path. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-5-maciej.fijalkowski@intel.com --- include/net/xsk_buff_pool.h | 1 + net/xdp/xsk_buff_pool.c | 1 + net/xdp/xsk_queue.h | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 468a23b1b4c5..bb03cee716b3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -76,6 +76,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u32 xdp_zc_max_segs; u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 7ecd4ccd2473..e946ba4a5ccf 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -229,6 +229,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_xsk; } pool->umem->zc = true; + pool->xdp_zc_max_segs = netdev->xdp_zc_max_segs; return 0; err_unreg_xsk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 406b20dfee8d..46d87e961ad6 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -260,7 +260,7 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, nr_frags = 0; } else { nr_frags++; - if (nr_frags == pool->netdev->xdp_zc_max_segs) { + if (nr_frags == pool->xdp_zc_max_segs) { nr_frags = 0; break; } -- cgit v1.3 From 1d10b2bed2d4b2003f174da739d8163b7f7957cf Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:57 +0200 Subject: xsk: Wrap duplicated code to function Both allocation paths have exactly the same code responsible for getting and initializing xskb. Pull it out to common function. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-6-maciej.fijalkowski@intel.com --- net/xdp/xsk_buff_pool.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e946ba4a5ccf..ae71da7d2cd6 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -503,6 +503,22 @@ static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) return *addr < pool->addrs_cnt; } +static struct xdp_buff_xsk *xp_get_xskb(struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_buff_xsk *xskb; + + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + + return xskb; +} + static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb; @@ -528,14 +544,7 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) break; } - if (pool->unaligned) { - xskb = pool->free_heads[--pool->free_heads_cnt]; - xp_init_xskb_addr(xskb, pool, addr); - if (pool->dma_pages) - xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); - } else { - xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; - } + xskb = xp_get_xskb(pool, addr); xskq_cons_release(pool->fq); return xskb; @@ -593,14 +602,7 @@ static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xd continue; } - if (pool->unaligned) { - xskb = pool->free_heads[--pool->free_heads_cnt]; - xp_init_xskb_addr(xskb, pool, addr); - if (pool->dma_pages) - xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); - } else { - xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; - } + xskb = xp_get_xskb(pool, addr); *xdp = &xskb->xdp; xdp++; -- cgit v1.3 From e6c4047f5122803f2fe4ab9b1ab7038626e51ec1 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 7 Oct 2024 14:24:58 +0200 Subject: xsk: Use xsk_buff_pool directly for cq functions Currently xsk_cq_{reserve_addr,submit,cancel}_locked() take xdp_sock as an input argument but it is only used for pulling out xsk_buff_pool pointer from it. Change mentioned functions to take pool pointer as an input argument to avoid unnecessary dereferences. Signed-off-by: Maciej Fijalkowski Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20241007122458.282590-7-maciej.fijalkowski@intel.com --- net/xdp/xsk.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6c31c1de1619..7d7e37f53708 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -527,34 +527,34 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) { unsigned long flags; int ret; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(pool->cq, addr); + spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } -static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) +static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_n(xs->pool->cq, n); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_lock, flags); + xskq_prod_submit_n(pool->cq, n); + spin_unlock_irqrestore(&pool->cq_lock, flags); } -static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel_n(xs->pool->cq, n); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + spin_lock_irqsave(&pool->cq_lock, flags); + xskq_prod_cancel_n(pool->cq, n); + spin_unlock_irqrestore(&pool->cq_lock, flags); } static u32 xsk_get_num_desc(struct sk_buff *skb) @@ -571,7 +571,7 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); + xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); sock_wfree(skb); } @@ -587,7 +587,7 @@ static void xsk_consume_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -765,7 +765,7 @@ free_err: xskq_cons_release(xs->tx); } else { /* Let application retry */ - xsk_cq_cancel_locked(xs, 1); + xsk_cq_cancel_locked(xs->pool, 1); } return ERR_PTR(err); @@ -802,7 +802,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (xsk_cq_reserve_addr_locked(xs, desc.addr)) + if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr)) goto out; skb = xsk_build_skb(xs, &desc); -- cgit v1.3 From 78e2baf3d96edd21c6f26d8afc0e68d02ec2c51c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:13 +0000 Subject: net: add TIME_WAIT logic to sk_to_full_sk() TCP will soon attach TIME_WAIT sockets to some ACK and RST. Make sure sk_to_full_sk() detects this and does not return a non full socket. v3: also changed sk_const_to_full_sk() Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Martin KaFai Lau Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 2 +- include/net/inet_sock.h | 8 ++++++-- net/core/filter.c | 6 +----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index ce91d9b2acb9..f0f219271daf 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -209,7 +209,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk) && __sk == skb_to_full_sk(skb) && \ + if (__sk && __sk == skb_to_full_sk(skb) && \ cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index f01dd273bea6..56d8bc5593d3 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -321,8 +321,10 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } @@ -331,8 +333,10 @@ static inline struct sock *sk_to_full_sk(struct sock *sk) static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET - if (sk && sk->sk_state == TCP_NEW_SYN_RECV) + if (sk && READ_ONCE(sk->sk_state) == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; + if (sk && READ_ONCE(sk->sk_state) == TCP_TIME_WAIT) + sk = NULL; #endif return sk; } diff --git a/net/core/filter.c b/net/core/filter.c index bd0d08bf76bb..202c1d386e19 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6778,8 +6778,6 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -6826,8 +6824,6 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk * sock refcnt is decremented to prevent a request_sock leak. */ - if (!sk_fullsock(sk2)) - sk2 = NULL; if (sk2 != sk) { sock_gen_put(sk); /* Ensure there is no need to bump sk2 refcnt */ @@ -7276,7 +7272,7 @@ BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk) { sk = sk_to_full_sk(sk); - if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) + if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE)) return (unsigned long)sk; return (unsigned long)NULL; -- cgit v1.3 From bc43a3c83cad46a27d6e3bf869acdd926bbe79ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:14 +0000 Subject: net_sched: sch_fq: prepare for TIME_WAIT sockets TCP stack is not attaching skb to TIME_WAIT sockets yet, but we would like to allow this in the future. Add sk_listener_or_tw() helper to detect the three states that FQ needs to take care. Like NEW_SYN_RECV, TIME_WAIT are not full sockets and do not contain sk->sk_pacing_status, sk->sk_pacing_rate. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 10 ++++++++++ net/sched/sch_fq.c | 3 ++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 6da420ab1ee1..2f200d91ef11 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2802,6 +2802,16 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } +/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV or TIME_WAIT + * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV (depending on SYNCOOKIE) + * TCP RST and ACK can be attached to TIME_WAIT. + */ +static inline bool sk_listener_or_tw(const struct sock *sk) +{ + return (1 << READ_ONCE(sk->sk_state)) & + (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT); +} + void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index aeabf45c9200..a97638bef6da 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -362,8 +362,9 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb, * 3) We do not want to rate limit them (eg SYNFLOOD attack), * especially if the listener set SO_MAX_PACING_RATE * 4) We pretend they are orphaned + * TCP can also associate TIME_WAIT sockets with RST or ACK packets. */ - if (!sk || sk_listener(sk)) { + if (!sk || sk_listener_or_tw(sk)) { unsigned long hash = skb_get_hash(skb) & q->orphan_mask; /* By forcing low order bit to 1, we make sure to not -- cgit v1.3 From 5ced52fa8f0dc23adb067f7b8a009a5ee051efb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:15 +0000 Subject: net: add skb_set_owner_edemux() helper This can be used to attach a socket to an skb, taking a reference on sk->sk_refcnt. This helper might be a NOP if sk->sk_refcnt is zero. Use it from tcp_make_synack(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 +++++++++ net/core/sock.c | 9 +++------ net/ipv4/tcp_output.c | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 2f200d91ef11..bf7fa3db10ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1760,6 +1760,15 @@ void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); + +static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + skb->sk = sk; + skb->destructor = sock_edemux; + } +} #else #define sock_edemux sock_efree #endif diff --git a/net/core/sock.c b/net/core/sock.c index 083d438d8b6f..f8c0d4eda888 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2592,14 +2592,11 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); - skb->sk = sk; #ifdef CONFIG_INET - if (unlikely(!sk_fullsock(sk))) { - skb->destructor = sock_edemux; - sock_hold(sk); - return; - } + if (unlikely(!sk_fullsock(sk))) + return skb_set_owner_edemux(skb, sk); #endif + skb->sk = sk; skb->destructor = sock_wfree; skb_set_hash_from_sk(skb, sk); /* diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 06200bb111f8..054244ce5117 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3728,7 +3728,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, switch (synack_type) { case TCP_SYNACK_NORMAL: - skb_set_owner_w(skb, req_to_sk(req)); + skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: /* Under synflood, we do not attach skb to a socket, -- cgit v1.3 From 507a96737d99686ca1714c7ba1f60ac323178189 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:16 +0000 Subject: ipv6: tcp: give socket pointer to control skbs tcp_v6_send_response() send orphaned 'control packets'. These are RST packets and also ACK packets sent from TIME_WAIT. Some eBPF programs would prefer to have a meaningful skb->sk pointer as much as possible. This means that TCP can now attach TIME_WAIT sockets to outgoing skbs. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7634c0be6acb..597920061a3a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -967,6 +967,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } if (sk) { + /* unconstify the socket only to attach it to buff with care. */ + skb_set_owner_edemux(buff, (struct sock *)sk); + if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else -- cgit v1.3 From 79636038d37e7bd4d078238f2a3f002cab4423bc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2024 17:48:17 +0000 Subject: ipv4: tcp: give socket pointer to control skbs ip_send_unicast_reply() send orphaned 'control packets'. These are RST packets and also ACK packets sent from TIME_WAIT. Some eBPF programs would prefer to have a meaningful skb->sk pointer as much as possible. This means that TCP can now attach TIME_WAIT sockets to outgoing skbs. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241010174817.1543642-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 3 ++- net/ipv4/ip_output.c | 5 ++++- net/ipv4/tcp_ipv4.c | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index bab084df1567..4be0a6a603b2 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -288,7 +288,8 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e5c55a95063d..0065b1996c94 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1596,7 +1596,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. */ -void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, + struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1662,6 +1663,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; + if (orig_sk) + skb_set_owner_edemux(nskb, (struct sock *)orig_sk); if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 985028434f64..9d3dd101ea71 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -907,7 +907,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, ctl_sk->sk_mark = 0; ctl_sk->sk_priority = 0; } - ip_send_unicast_reply(ctl_sk, + ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, @@ -1021,7 +1021,7 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); - ip_send_unicast_reply(ctl_sk, + ip_send_unicast_reply(ctl_sk, sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len, -- cgit v1.3 From f15e3b3ddb9fab1c1731b6154e2cd6573fb54c4d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:56 +0000 Subject: net: napi: Make napi_defer_hard_irqs per-NAPI Add defer_hard_irqs to napi_struct in preparation for per-NAPI settings. The existing sysfs parameter is respected; writes to sysfs will write to all NAPI structs for the device and the net_device defer_hard_irq field. Reads from sysfs show the net_device field. The ability to set defer_hard_irqs on specific NAPI instances will be added in a later commit, via netdev-genl. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 +- net/core/dev.c | 10 +++--- net/core/dev.h | 36 ++++++++++++++++++++++ net/core/net-sysfs.c | 2 +- 5 files changed, 45 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 1b018ac35e9a..5a7388b2ab6f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -186,4 +186,5 @@ struct dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder u64 max_pacing_offload_horizon +u32 napi_defer_hard_irqs =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6b93d01e631..2e7bc23660ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -373,6 +373,7 @@ struct napi_struct { unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; + u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2085,7 +2086,6 @@ struct net_device { unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned long gro_flush_timeout; - u32 napi_defer_hard_irqs; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; @@ -2413,6 +2413,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + u32 napi_defer_hard_irqs; /** * @lock: protects @net_shaper_hierarchy, feel free to use for other diff --git a/net/core/dev.c b/net/core/dev.c index b590eefce3b4..fbaa9eabf77f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6233,7 +6233,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) timeout = READ_ONCE(n->dev->gro_flush_timeout); - n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; @@ -6371,7 +6371,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (flags & NAPI_F_PREFER_BUSY_POLL) { - napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); + napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); @@ -6653,6 +6653,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); @@ -11059,7 +11060,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { dev->gro_flush_timeout = 20000; - dev->napi_defer_hard_irqs = 1; + netdev_set_defer_hard_irqs(dev, 1); } } EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); @@ -12003,7 +12004,6 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); @@ -12015,7 +12015,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100); } /* diff --git a/net/core/dev.h b/net/core/dev.h index d3ea92949ff3..0716b1048261 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -148,6 +148,42 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, WRITE_ONCE(dev->gro_ipv4_max_size, size); } +/** + * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs + * @n: napi struct to get the defer_hard_irqs field from + * + * Return: the per-NAPI value of the defar_hard_irqs field. + */ +static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n) +{ + return READ_ONCE(n->defer_hard_irqs); +} + +/** + * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi + * @n: napi_struct to set the defer_hard_irqs field + * @defer: the value the field should be set to + */ +static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) +{ + WRITE_ONCE(n->defer_hard_irqs, defer); +} + +/** + * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev + * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set + * @defer: the defer_hard_irqs value to set + */ +static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, + u32 defer) +{ + struct napi_struct *napi; + + WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_defer_hard_irqs(napi, defer); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 05cf5347f25e..25125f356a15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -429,7 +429,7 @@ static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val if (val > S32_MAX) return -ERANGE; - WRITE_ONCE(dev->napi_defer_hard_irqs, val); + netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } -- cgit v1.3 From 516010460011ae74ac3b7383cf90ed27e2711cd6 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:57 +0000 Subject: netdev-genl: Dump napi_defer_hard_irqs Support dumping defer_hard_irqs for a NAPI ID. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 8 ++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 4 files changed, 16 insertions(+) (limited to 'net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 08412c279297..585e87ec3c16 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -248,6 +248,13 @@ attribute-sets: threaded mode. If NAPI is not in threaded mode (i.e. uses normal softirq context), the attribute will be absent. type: u32 + - + name: defer-hard-irqs + doc: The number of consecutive empty polls before IRQ deferral ends + and hardware IRQs are re-enabled. + type: u32 + checks: + max: s32-max - name: queue attributes: @@ -636,6 +643,7 @@ operations: - ifindex - irq - pid + - defer-hard-irqs dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7c308f04e7a0..13dc0b027e86 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -122,6 +122,7 @@ enum { NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, + NETDEV_A_NAPI_DEFER_HARD_IRQS, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 358cba248796..f98e5d1d0d21 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -189,6 +190,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, goto nla_put_failure; } + napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi); + if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS, + napi_defer_hard_irqs)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7c308f04e7a0..13dc0b027e86 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -122,6 +122,7 @@ enum { NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, + NETDEV_A_NAPI_DEFER_HARD_IRQS, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.3 From acb8d4ed5661d05f794ef2ce34fd11e699e9ca32 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:58 +0000 Subject: net: napi: Make gro_flush_timeout per-NAPI Allow per-NAPI gro_flush_timeout setting. The existing sysfs parameter is respected; writes to sysfs will write to all NAPI structs for the device and the net_device gro_flush_timeout field. Reads from sysfs will read from the net_device field. The ability to set gro_flush_timeout on specific NAPI instances will be added in a later commit, via netdev-genl. Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20241011184527.16393-4-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 3 +- net/core/dev.c | 12 +++---- net/core/dev.h | 40 ++++++++++++++++++++++ net/core/net-sysfs.c | 2 +- 5 files changed, 50 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 5a7388b2ab6f..67910ea49160 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -186,5 +186,6 @@ struct dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder u64 max_pacing_offload_horizon +unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2e7bc23660ec..93241d4de437 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -373,6 +373,7 @@ struct napi_struct { unsigned int napi_id; struct hrtimer timer; struct task_struct *thread; + unsigned long gro_flush_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; @@ -2085,7 +2086,6 @@ struct net_device { int ifindex; unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; - unsigned long gro_flush_timeout; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; @@ -2413,6 +2413,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; /** diff --git a/net/core/dev.c b/net/core/dev.c index fbaa9eabf77f..e21ace3551d5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6232,12 +6232,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = READ_ONCE(n->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(n); if (timeout) ret = false; } @@ -6372,7 +6372,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = READ_ONCE(napi->dev->gro_flush_timeout); + timeout = napi_get_gro_flush_timeout(napi); if (napi->defer_hard_irqs_count && timeout) { hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); skip_schedule = true; @@ -6654,6 +6654,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); @@ -11059,7 +11060,7 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev) WARN_ON(dev->reg_state == NETREG_REGISTERED); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - dev->gro_flush_timeout = 20000; + netdev_set_gro_flush_timeout(dev, 20000); netdev_set_defer_hard_irqs(dev, 1); } } @@ -12003,7 +12004,6 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx); - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler); @@ -12015,7 +12015,7 @@ static void __init net_dev_struct_check(void) #ifdef CONFIG_NET_XGRESS CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress); #endif - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 100); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92); } /* diff --git a/net/core/dev.h b/net/core/dev.h index 0716b1048261..7d0aab7e3ef1 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -184,6 +184,46 @@ static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, napi_set_defer_hard_irqs(napi, defer); } +/** + * napi_get_gro_flush_timeout - get the gro_flush_timeout + * @n: napi struct to get the gro_flush_timeout from + * + * Return: the per-NAPI value of the gro_flush_timeout field. + */ +static inline unsigned long +napi_get_gro_flush_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->gro_flush_timeout); +} + +/** + * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi + * @n: napi struct to set the gro_flush_timeout + * @timeout: timeout value to set + * + * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout + */ +static inline void napi_set_gro_flush_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->gro_flush_timeout, timeout); +} + +/** + * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs + * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set + * @timeout: the timeout value to set + */ +static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, + unsigned long timeout) +{ + struct napi_struct *napi; + + WRITE_ONCE(netdev->gro_flush_timeout, timeout); + list_for_each_entry(napi, &netdev->napi_list, dev_list) + napi_set_gro_flush_timeout(napi, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 25125f356a15..2d9afc6e2161 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -409,7 +409,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - WRITE_ONCE(dev->gro_flush_timeout, val); + netdev_set_gro_flush_timeout(dev, val); return 0; } -- cgit v1.3 From 0137891e74576f77a7901718dc0ce08ca074ae74 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:44:59 +0000 Subject: netdev-genl: Dump gro_flush_timeout Support dumping gro_flush_timeout for a NAPI ID. Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-5-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 9 +++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 4 files changed, 17 insertions(+) (limited to 'net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 585e87ec3c16..7b47454c51dd 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -255,6 +255,14 @@ attribute-sets: type: u32 checks: max: s32-max + - + name: gro-flush-timeout + doc: The timeout, in nanoseconds, of when to trigger the NAPI watchdog + timer which schedules NAPI processing. Additionally, a non-zero + value will also prevent GRO from flushing recent super-frames at + the end of a NAPI cycle. This may add receive latency in exchange + for reducing the number of frames processed by the network stack. + type: uint - name: queue attributes: @@ -644,6 +652,7 @@ operations: - irq - pid - defer-hard-irqs + - gro-flush-timeout dump: request: attributes: diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 13dc0b027e86..cacd33359c76 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -123,6 +123,7 @@ enum { NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, + NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index f98e5d1d0d21..ac19f2e6cfbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; pid_t pid; @@ -195,6 +196,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + gro_flush_timeout)) + goto nla_put_failure; + genlmsg_end(rsp, hdr); return 0; diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 13dc0b027e86..cacd33359c76 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -123,6 +123,7 @@ enum { NETDEV_A_NAPI_IRQ, NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, + NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.3 From 86e25f40aa1e9e54e081e55016f65b5c92523989 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:45:00 +0000 Subject: net: napi: Add napi_config Add a persistent NAPI config area for NAPI configuration to the core. Drivers opt-in to setting the persistent config for a NAPI by passing an index when calling netif_napi_add_config. napi_config is allocated in alloc_netdev_mqs, freed in free_netdev (after the NAPIs are deleted). Drivers which call netif_napi_add_config will have persistent per-NAPI settings: NAPI IDs, gro_flush_timeout, and defer_hard_irq settings. Per-NAPI settings are saved in napi_disable and restored in napi_enable. Co-developed-by: Martin Karsten Signed-off-by: Martin Karsten Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-6-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 36 +++++++++- net/core/dev.c | 80 ++++++++++++++++++++-- net/core/dev.h | 12 ++++ 4 files changed, 119 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 67910ea49160..db6192b2bb50 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -186,6 +186,7 @@ struct dpll_pin* dpll_pin struct hlist_head page_pools struct dim_irq_moder* irq_moder u64 max_pacing_offload_horizon +struct_napi_config* napi_config unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93241d4de437..8feaca12655e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -342,6 +342,15 @@ struct gro_list { */ #define GRO_HASH_BUCKETS 8 +/* + * Structure for per-NAPI config + */ +struct napi_config { + u64 gro_flush_timeout; + u32 defer_hard_irqs; + unsigned int napi_id; +}; + /* * Structure for NAPI scheduling similar to tasklet but with weighting */ @@ -379,6 +388,8 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; int irq; + int index; + struct napi_config *config; }; enum { @@ -1868,9 +1879,6 @@ enum netdev_reg_state { * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer - * @gro_flush_timeout: timeout for GRO layer in NAPI - * @napi_defer_hard_irqs: If not zero, provides a counter that would - * allow to avoid NIC hard IRQ, on busy queues. * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one @@ -2020,6 +2028,11 @@ enum netdev_reg_state { * where the clock is recovered. * * @max_pacing_offload_horizon: max EDT offload horizon in nsec. + * @napi_config: An array of napi_config structures containing per-NAPI + * settings. + * @gro_flush_timeout: timeout for GRO layer in NAPI + * @napi_defer_hard_irqs: If not zero, provides a counter that would + * allow to avoid NIC hard IRQ, on busy queues. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2413,6 +2426,7 @@ struct net_device { struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; + struct napi_config *napi_config; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; @@ -2678,6 +2692,22 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } +/** + * netif_napi_add_config - initialize a NAPI context with persistent config + * @dev: network device + * @napi: NAPI context + * @poll: polling function + * @index: the NAPI index + */ +static inline void +netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int index) +{ + napi->index = index; + napi->config = &dev->napi_config[index]; + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); +} + /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device diff --git a/net/core/dev.c b/net/core/dev.c index e21ace3551d5..c682173a7642 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6505,6 +6505,23 @@ EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ +static void __napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + napi->napi_id = napi_id; + hlist_add_head_rcu(&napi->napi_hash_node, + &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); +} + +static void napi_hash_add_with_id(struct napi_struct *napi, + unsigned int napi_id) +{ + spin_lock(&napi_hash_lock); + WARN_ON_ONCE(napi_by_id(napi_id)); + __napi_hash_add_with_id(napi, napi_id); + spin_unlock(&napi_hash_lock); +} + static void napi_hash_add(struct napi_struct *napi) { if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state)) @@ -6517,10 +6534,8 @@ static void napi_hash_add(struct napi_struct *napi) if (unlikely(++napi_gen_id < MIN_NAPI_ID)) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); - napi->napi_id = napi_gen_id; - hlist_add_head_rcu(&napi->napi_hash_node, - &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); + __napi_hash_add_with_id(napi, napi_gen_id); spin_unlock(&napi_hash_lock); } @@ -6643,6 +6658,28 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void napi_restore_config(struct napi_struct *n) +{ + n->defer_hard_irqs = n->config->defer_hard_irqs; + n->gro_flush_timeout = n->config->gro_flush_timeout; + /* a NAPI ID might be stored in the config, if so use it. if not, use + * napi_hash_add to generate one for us. It will be saved to the config + * in napi_disable. + */ + if (n->config->napi_id) + napi_hash_add_with_id(n, n->config->napi_id); + else + napi_hash_add(n); +} + +static void napi_save_config(struct napi_struct *n) +{ + n->config->defer_hard_irqs = n->defer_hard_irqs; + n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->napi_id = n->napi_id; + napi_hash_del(n); +} + void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6653,8 +6690,6 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); - napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); init_gro_hash(napi); napi->skb = NULL; INIT_LIST_HEAD(&napi->rx_list); @@ -6672,7 +6707,13 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); - napi_hash_add(napi); + + /* default settings from sysfs are applied to all NAPIs. any per-NAPI + * configuration will be loaded in napi_enable + */ + napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs)); + napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout)); + napi_get_frags_check(napi); /* Create kthread for this napi if dev->threaded is set. * Clear dev->threaded if kthread creation failed so that @@ -6704,6 +6745,11 @@ void napi_disable(struct napi_struct *n) hrtimer_cancel(&n->timer); + if (n->config) + napi_save_config(n); + else + napi_hash_del(n); + clear_bit(NAPI_STATE_DISABLE, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6719,6 +6765,11 @@ void napi_enable(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); + if (n->config) + napi_restore_config(n); + else + napi_hash_add(n); + do { BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); @@ -6748,7 +6799,11 @@ void __netif_napi_del(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; - napi_hash_del(napi); + if (napi->config) { + napi->index = -1; + napi->config = NULL; + } + list_del_rcu(&napi->dev_list); napi_free_frags(napi); @@ -11085,6 +11140,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned int txqs, unsigned int rxqs) { struct net_device *dev; + size_t napi_config_sz; + unsigned int maxqs; BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -11098,6 +11155,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } + maxqs = max(txqs, rxqs); + dev = kvzalloc(struct_size(dev, priv, sizeof_priv), GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!dev) @@ -11174,6 +11233,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); + dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); + if (!dev->napi_config) + goto free_all; + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; @@ -11237,6 +11301,8 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + kvfree(dev->napi_config); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); diff --git a/net/core/dev.h b/net/core/dev.h index 7d0aab7e3ef1..7881bced70a9 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -177,11 +177,17 @@ static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer) static inline void netdev_set_defer_hard_irqs(struct net_device *netdev, u32 defer) { + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); struct napi_struct *napi; + int i; WRITE_ONCE(netdev->napi_defer_hard_irqs, defer); list_for_each_entry(napi, &netdev->napi_list, dev_list) napi_set_defer_hard_irqs(napi, defer); + + for (i = 0; i < count; i++) + netdev->napi_config[i].defer_hard_irqs = defer; } /** @@ -217,11 +223,17 @@ static inline void napi_set_gro_flush_timeout(struct napi_struct *n, static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, unsigned long timeout) { + unsigned int count = max(netdev->num_rx_queues, + netdev->num_tx_queues); struct napi_struct *napi; + int i; WRITE_ONCE(netdev->gro_flush_timeout, timeout); list_for_each_entry(napi, &netdev->napi_list, dev_list) napi_set_gro_flush_timeout(napi, timeout); + + for (i = 0; i < count; i++) + netdev->napi_config[i].gro_flush_timeout = timeout; } int rps_cpumask_housekeeping(struct cpumask *mask); -- cgit v1.3 From 1287c1ae0fc227e5acef11a539eb4e75646e31c7 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 11 Oct 2024 18:45:01 +0000 Subject: netdev-genl: Support setting per-NAPI config values Add support to set per-NAPI defer_hard_irqs and gro_flush_timeout. Signed-off-by: Joe Damato Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241011184527.16393-7-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 11 ++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl-gen.c | 18 +++++++++++++ net/core/netdev-genl-gen.h | 1 + net/core/netdev-genl.c | 45 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 6 files changed, 77 insertions(+) (limited to 'net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 7b47454c51dd..f9cb97d6106c 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -693,6 +693,17 @@ operations: reply: attributes: - id + - + name: napi-set + doc: Set configurable NAPI instance settings. + attribute-set: napi + flags: [ admin-perm ] + do: + request: + attributes: + - id + - defer-hard-irqs + - gro-flush-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index cacd33359c76..e3ebb49f60d2 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -201,6 +201,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_NAPI_SET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index b28424ae06d5..e197bd84997c 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -22,6 +22,10 @@ static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = .max = 2147483647ULL, }; +static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { + .max = 2147483647ULL, +}; + /* Common nested types */ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), @@ -87,6 +91,13 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), }; +/* NETDEV_CMD_NAPI_SET - do */ +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { + [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, + [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), + [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -171,6 +182,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_NAPI_SET, + .doit = netdev_nl_napi_set_doit, + .policy = netdev_napi_set_nl_policy, + .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 8cda334fd042..e09dd7539ff2 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index ac19f2e6cfbe..b49c3b4e5fbe 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -303,6 +303,51 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) return err; } +static int +netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) +{ + u64 gro_flush_timeout = 0; + u32 defer = 0; + + if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) { + defer = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]); + napi_set_defer_hard_irqs(napi, defer); + } + + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { + gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); + napi_set_gro_flush_timeout(napi, gro_flush_timeout); + } + + return 0; +} + +int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct napi_struct *napi; + unsigned int napi_id; + int err; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID)) + return -EINVAL; + + napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); + + rtnl_lock(); + + napi = napi_by_id(napi_id); + if (napi) { + err = netdev_nl_napi_set_config(napi, info); + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } + + rtnl_unlock(); + + return err; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index cacd33359c76..e3ebb49f60d2 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -201,6 +201,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_NAPI_SET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.3 From 6c959fd5e17387201dba3619b2e6af213939a0a7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 30 Sep 2024 02:58:54 -0700 Subject: netfilter: Make legacy configs user selectable This option makes legacy Netfilter Kconfig user selectable, giving users the option to configure iptables without enabling any other config. Make the following KConfig entries user selectable: * BRIDGE_NF_EBTABLES_LEGACY * IP_NF_ARPTABLES * IP_NF_IPTABLES_LEGACY * IP6_NF_IPTABLES_LEGACY Signed-off-by: Breno Leitao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/Kconfig | 8 +++++++- net/ipv4/netfilter/Kconfig | 16 ++++++++++++++-- net/ipv6/netfilter/Kconfig | 9 ++++++++- 3 files changed, 29 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 104c0125e32e..f16bbbbb9481 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -41,7 +41,13 @@ config NF_CONNTRACK_BRIDGE # old sockopt interface and eval loop config BRIDGE_NF_EBTABLES_LEGACY - tristate + tristate "Legacy EBTABLES support" + depends on BRIDGE && NETFILTER_XTABLES + default n + help + Legacy ebtables packet/frame classifier. + This is not needed if you are using ebtables over nftables + (iptables-nft). menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1b991b889506..ef8009281da5 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -12,7 +12,13 @@ config NF_DEFRAG_IPV4 # old sockopt interface and eval loop config IP_NF_IPTABLES_LEGACY - tristate + tristate "Legacy IP tables support" + default n + select NETFILTER_XTABLES + help + iptables is a legacy packet classifier. + This is not needed if you are using iptables over nftables + (iptables-nft). config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" @@ -318,7 +324,13 @@ endif # IP_NF_IPTABLES # ARP tables config IP_NF_ARPTABLES - tristate + tristate "Legacy ARPTABLES support" + depends on NETFILTER_XTABLES + default n + help + arptables is a legacy packet classifier. + This is not needed if you are using arptables over nftables + (iptables-nft). config NFT_COMPAT_ARP tristate diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index f3c8e2d918e1..e087a8e97ba7 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -8,7 +8,14 @@ menu "IPv6: Netfilter Configuration" # old sockopt interface and eval loop config IP6_NF_IPTABLES_LEGACY - tristate + tristate "Legacy IP6 tables support" + depends on INET && IPV6 + select NETFILTER_XTABLES + default n + help + ip6tables is a legacy packet classifier. + This is not needed if you are using iptables over nftables + (iptables-nft). config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" -- cgit v1.3 From bcbbfaa2612d7d0e9c3eafade5f6e93c3672f34f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 10 Oct 2024 08:12:48 -0700 Subject: tools: ynl-gen: use names of constants in generated limits YNL specs can use string expressions for limits, like s32-min or u16-max. We convert all of those into their numeric values when generating the code, which isn't always helpful. Try to retain the string representations in the output. Any sort of calculations still need the integers. Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241010151248.2049755-1-kuba@kernel.org [pabeni@redhat.com: regenerated netdev-genl-gen.c] Signed-off-by: Paolo Abeni --- net/core/netdev-genl-gen.c | 6 +++--- tools/net/ynl/ynl-gen-c.py | 36 +++++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index e197bd84997c..21de7e10be16 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -14,16 +14,16 @@ /* Integer value ranges */ static const struct netlink_range_validation netdev_a_page_pool_id_range = { .min = 1ULL, - .max = 4294967295ULL, + .max = U32_MAX, }; static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = { .min = 1ULL, - .max = 2147483647ULL, + .max = S32_MAX, }; static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = { - .max = 2147483647ULL, + .max = S32_MAX, }; /* Common nested types */ diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 9e8254aad578..d64cb2b49c44 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -80,11 +80,21 @@ class Type(SpecAttr): value = self.checks.get(limit, default) if value is None: return value - elif value in self.family.consts: + if isinstance(value, int): + return value + if value in self.family.consts: + raise Exception("Resolving family constants not implemented, yet") + return limit_to_number(value) + + def get_limit_str(self, limit, default=None, suffix=''): + value = self.checks.get(limit, default) + if value is None: + return '' + if isinstance(value, int): + return str(value) + suffix + if value in self.family.consts: return c_upper(f"{self.family['name']}-{value}") - if not isinstance(value, int): - value = limit_to_number(value) - return value + return c_upper(value) def resolve(self): if 'name-prefix' in self.attr: @@ -358,11 +368,11 @@ class TypeScalar(Type): elif 'full-range' in self.checks: return f"NLA_POLICY_FULL_RANGE({policy}, &{c_lower(self.enum_name)}_range)" elif 'range' in self.checks: - return f"NLA_POLICY_RANGE({policy}, {self.get_limit('min')}, {self.get_limit('max')})" + return f"NLA_POLICY_RANGE({policy}, {self.get_limit_str('min')}, {self.get_limit_str('max')})" elif 'min' in self.checks: - return f"NLA_POLICY_MIN({policy}, {self.get_limit('min')})" + return f"NLA_POLICY_MIN({policy}, {self.get_limit_str('min')})" elif 'max' in self.checks: - return f"NLA_POLICY_MAX({policy}, {self.get_limit('max')})" + return f"NLA_POLICY_MAX({policy}, {self.get_limit_str('max')})" return super()._attr_policy(policy) def _attr_typol(self): @@ -413,11 +423,11 @@ class TypeString(Type): def _attr_policy(self, policy): if 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + self.get_limit_str('exact-len') + ')' else: mem = '{ .type = ' + policy if 'max-len' in self.checks: - mem += ', .len = ' + str(self.get_limit('max-len')) + mem += ', .len = ' + self.get_limit_str('max-len') mem += ', }' return mem @@ -476,9 +486,9 @@ class TypeBinary(Type): if len(self.checks) == 0: mem = '{ .type = NLA_BINARY, }' elif 'exact-len' in self.checks: - mem = 'NLA_POLICY_EXACT_LEN(' + str(self.get_limit('exact-len')) + ')' + mem = 'NLA_POLICY_EXACT_LEN(' + self.get_limit_str('exact-len') + ')' elif 'min-len' in self.checks: - mem = '{ .len = ' + str(self.get_limit('min-len')) + ', }' + mem = '{ .len = ' + self.get_limit_str('min-len') + ', }' return mem @@ -2166,9 +2176,9 @@ def print_kernel_policy_ranges(family, cw): cw.block_start(line=f'static const struct netlink_range_validation{sign} {c_lower(attr.enum_name)}_range =') members = [] if 'min' in attr.checks: - members.append(('min', str(attr.get_limit('min')) + suffix)) + members.append(('min', attr.get_limit_str('min', suffix=suffix))) if 'max' in attr.checks: - members.append(('max', str(attr.get_limit('max')) + suffix)) + members.append(('max', attr.get_limit_str('max', suffix=suffix))) cw.write_struct_init(members) cw.block_end(line=';') cw.nl() -- cgit v1.3 From bb9df28e6fcda6a96860e7b77f3912ef50e06793 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 10 Oct 2024 10:24:33 -0700 Subject: rtnl_net_debug: Remove rtnl_net_debug_exit(). kernel test robot reported section mismatch in rtnl_net_debug_exit(). WARNING: modpost: vmlinux: section mismatch in reference: rtnl_net_debug_exit+0x20 (section: .exit.text) -> rtnl_net_debug_net_ops (section: .init.data) rtnl_net_debug_exit() uses rtnl_net_debug_net_ops() that is annotated as __net_initdata, but this file is always built-in. Let's remove rtnl_net_debug_exit(). Fixes: 03fa53485659 ("rtnetlink: Add ASSERT_RTNL_NET() placeholder for netdev notifier.") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410101854.i0vQCaDz-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241010172433.67694-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/core/rtnl_net_debug.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c index e90a32242e22..f406045cbd0e 100644 --- a/net/core/rtnl_net_debug.c +++ b/net/core/rtnl_net_debug.c @@ -122,10 +122,4 @@ static int __init rtnl_net_debug_init(void) return ret; } -static void __exit rtnl_net_debug_exit(void) -{ - unregister_netdevice_notifier(&rtnl_net_debug_block); - unregister_pernet_device(&rtnl_net_debug_net_ops); -} - subsys_initcall(rtnl_net_debug_init); -- cgit v1.3 From 0741f55593547d7f25ec003b355a21d6d5fef01e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 29 Aug 2024 17:29:32 +0200 Subject: netfilter: nf_tables: Fix percpu address space issues in nf_tables_api.c Compiling nf_tables_api.c results in several sparse warnings: nf_tables_api.c:2077:31: warning: incorrect type in return expression (different address spaces) nf_tables_api.c:2080:31: warning: incorrect type in return expression (different address spaces) nf_tables_api.c:2084:31: warning: incorrect type in return expression (different address spaces) nf_tables_api.c:2740:23: warning: incorrect type in assignment (different address spaces) nf_tables_api.c:2752:38: warning: incorrect type in assignment (different address spaces) nf_tables_api.c:2798:21: warning: incorrect type in argument 1 (different address spaces) Use {ERR_PTR,IS_ERR,PTR_ERR}_PCPU() macros when crossing between generic and percpu address spaces and add __percpu annotation to *stats pointer to fix these warnings. Found by GCC's named address space checks. There were no changes in the resulting object files. Signed-off-by: Uros Bizjak Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a24fe62650a7..6552ec616745 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2082,14 +2082,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) - return ERR_PTR(err); + return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return ERR_PTR(-EINVAL); + return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return ERR_PTR(-ENOMEM); + return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -2533,10 +2533,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { + if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); - return PTR_ERR(stats); + return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); } @@ -2650,7 +2650,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; - struct nft_stats *stats = NULL; + struct nft_stats __percpu *stats = NULL; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -2746,8 +2746,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { - err = PTR_ERR(stats); + if (IS_ERR_PCPU(stats)) { + err = PTR_ERR_PCPU(stats); goto err_hooks; } } -- cgit v1.3 From 544dded8cb6317c2d3ecf4bba8412e616e70bb86 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 9 Sep 2024 15:48:39 -0700 Subject: netfilter: nf_tables: replace deprecated strncpy with strscpy_pad strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. In this particular instance, the usage of strncpy() is fine and works as expected. However, towards the goal of [2], we should consider replacing it with an alternative as many instances of strncpy() are bug-prone. Its removal from the kernel promotes better long term health for the codebase. The current usage of strncpy() likely just wants the NUL-padding behavior offered by strncpy() and doesn't care about the NUL-termination. Since the compiler doesn't know the size of @dest, we can't use strtomem_pad(). Instead, use strscpy_pad() which behaves functionally the same as strncpy() in this context -- as we expect br_dev->name to be NUL-terminated itself. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://github.com/KSPP/linux/issues/90 [2] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html Cc: Kees Cook Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_meta_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index d12a221366d6..5adced1e7d0c 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -63,7 +63,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, return nft_meta_get_eval(expr, regs, pkt); } - strncpy((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ); + strscpy_pad((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ); return; err: regs->verdict.code = NFT_BREAK; -- cgit v1.3 From 08e52cccae11a4148a5810f0bd5614796994d221 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Jul 2024 15:08:16 +0200 Subject: netfilter: nf_tables: prefer nft_trans_elem_alloc helper Reduce references to sizeof(struct nft_trans_elem). Preparation patch to move this to a flexiable array to store elem references. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6552ec616745..30331688301e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6409,7 +6409,7 @@ err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } -static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, +static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { @@ -7471,13 +7471,11 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, { struct nft_trans *trans; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_KERNEL); + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (!trans) return -ENOMEM; nft_setelem_data_deactivate(ctx->net, set, elem_priv); - nft_trans_elem_set(trans) = set; nft_trans_elem_priv(trans) = elem_priv; nft_trans_commit_list_add_tail(ctx->net, trans); -- cgit v1.3 From 2cee3e6e2e4b74bec96694169f01cd3feec1f264 Mon Sep 17 00:00:00 2001 From: Gur Stavi Date: Sun, 13 Oct 2024 10:15:25 +0300 Subject: af_packet: allow fanout_add when socket is not RUNNING PACKET socket can retain its fanout membership through link down and up and leave a fanout while closed regardless of link state. However, socket was forbidden from joining a fanout while it was not RUNNING. This patch allows PACKET socket to join fanout while not RUNNING. Socket can be RUNNING if it has a specified protocol. Either directly from packet_create (being implicitly bound to any interface) or following a successful bind. Socket RUNNING state is switched off if it is bound to an interface that went down. Instead of the test for RUNNING, this patch adds a test that socket can become RUNNING. Signed-off-by: Gur Stavi Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/4f1a3c37dbef980ef044c4d2adf91c76e2eca14b.1728802323.git.gur.stavi@huawei.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f8942062f776..2ff4b251842d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1846,21 +1846,22 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) err = -EINVAL; spin_lock(&po->bind_lock); - if (packet_sock_flag(po, PACKET_SOCK_RUNNING) && + if (po->num && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { - __dev_remove_pack(&po->prot_hook); - /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); - __fanout_link(sk, po); + if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { + __dev_remove_pack(&po->prot_hook); + __fanout_link(sk, po); + } err = 0; } } -- cgit v1.3 From 497e17d807593bbeedc466029b30c2daa25004ba Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:49 +0200 Subject: ipv4: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20241013201704.49576-3-Julia.Lawall@inria.fr Signed-off-by: Jakub Kicinski --- net/ipv4/fib_trie.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 09e31757e96c..161f5526b86c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -292,15 +292,9 @@ static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; -static void __alias_free_mem(struct rcu_head *head) -{ - struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - kmem_cache_free(fn_alias_kmem, fa); -} - static inline void alias_free_mem_rcu(struct fib_alias *fa) { - call_rcu(&fa->rcu, __alias_free_mem); + kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ -- cgit v1.3 From bb5810d4236be8750505ffc9f74e2403e7e7d617 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:50 +0200 Subject: inetpeer: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20241013201704.49576-4-Julia.Lawall@inria.fr Signed-off-by: Jakub Kicinski --- net/ipv4/inetpeer.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 5bd759963451..5ab56f4cb529 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,11 +128,6 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, return NULL; } -static void inetpeer_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu)); -} - /* perform garbage collect on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], @@ -168,7 +163,7 @@ static void inet_peer_gc(struct inet_peer_base *base, if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; - call_rcu(&p->rcu, inetpeer_free_rcu); + kfree_rcu(p, rcu); } } } @@ -242,7 +237,7 @@ void inet_putpeer(struct inet_peer *p) WRITE_ONCE(p->dtime, (__u32)jiffies); if (refcount_dec_and_test(&p->refcnt)) - call_rcu(&p->rcu, inetpeer_free_rcu); + kfree_rcu(p, rcu); } EXPORT_SYMBOL_GPL(inet_putpeer); -- cgit v1.3 From 85e48bcf294caf2915c16d89dbfbf936653415a3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:51 +0200 Subject: ipv6: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Eric Dumazet Acked-by: Paul E. McKenney Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20241013201704.49576-5-Julia.Lawall@inria.fr Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cea160b249d2..c9da10d971fa 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -198,16 +198,9 @@ static void node_free_immediate(struct net *net, struct fib6_node *fn) net->ipv6.rt6_stats->fib_nodes--; } -static void node_free_rcu(struct rcu_head *head) -{ - struct fib6_node *fn = container_of(head, struct fib6_node, rcu); - - kmem_cache_free(fib6_node_kmem, fn); -} - static void node_free(struct net *net, struct fib6_node *fn) { - call_rcu(&fn->rcu, node_free_rcu); + kfree_rcu(fn, rcu); net->ipv6.rt6_stats->fib_nodes--; } -- cgit v1.3 From 4ac64e570c337cfbff80f16334b10901168107eb Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:16:55 +0200 Subject: net: bridge: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Acked-by: Nikolay Aleksandrov Acked-by: Paul E. McKenney Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20241013201704.49576-9-Julia.Lawall@inria.fr Signed-off-by: Jakub Kicinski --- net/bridge/br_fdb.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 642b8ccaae8e..1cd7bade9b3b 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -73,13 +73,6 @@ static inline int has_expired(const struct net_bridge *br, time_before_eq(fdb->updated + hold_time(br), jiffies); } -static void fdb_rcu_free(struct rcu_head *head) -{ - struct net_bridge_fdb_entry *ent - = container_of(head, struct net_bridge_fdb_entry, rcu); - kmem_cache_free(br_fdb_cache, ent); -} - static int fdb_to_nud(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { @@ -329,7 +322,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &f->flags)) atomic_dec(&br->fdb_n_learned); fdb_notify(br, f, RTM_DELNEIGH, swdev_notify); - call_rcu(&f->rcu, fdb_rcu_free); + kfree_rcu(f, rcu); } /* Delete a local entry if no other port had the same address. -- cgit v1.3 From 7bb3ecbc2b6b146d244789025c892eb28c212d5c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 13 Oct 2024 22:17:01 +0200 Subject: kcm: replace call_rcu by kfree_rcu for simple kmem_cache_free callback Since SLOB was removed and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"), it is not necessary to use call_rcu when the callback only performs kmem_cache_free. Use kfree_rcu() directly. The changes were made using Coccinelle. Signed-off-by: Julia Lawall Acked-by: Paul E. McKenney Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20241013201704.49576-15-Julia.Lawall@inria.fr Signed-off-by: Jakub Kicinski --- net/kcm/kcmsock.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d4118c796290..24aec295a51c 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1584,14 +1584,6 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -static void free_mux(struct rcu_head *rcu) -{ - struct kcm_mux *mux = container_of(rcu, - struct kcm_mux, rcu); - - kmem_cache_free(kcm_muxp, mux); -} - static void release_mux(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; @@ -1619,7 +1611,7 @@ static void release_mux(struct kcm_mux *mux) knet->count--; mutex_unlock(&knet->mutex); - call_rcu(&mux->rcu, free_mux); + kfree_rcu(mux, rcu); } static void kcm_done(struct kcm_sock *kcm) -- cgit v1.3 From 95b3120a485f77a9bb8060bf3398311e3dcb6c65 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 16:52:16 -0700 Subject: neighbour: Remove NEIGH_DN_TABLE. Since commit 1202cdd66531 ("Remove DECnet support from kernel"), NEIGH_DN_TABLE is no longer used. MPLS has implicit dependency on it in nla_put_via(), but nla_get_via() does not support DECnet. Let's remove NEIGH_DN_TABLE. Now, neigh_tables[] has only 2 elements and no extra iteration for DECnet in many places. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241014235216.10785-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 - net/mpls/af_mpls.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index a44f262a7384..3887ed9e5026 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -239,7 +239,6 @@ struct neigh_table { enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, - NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index df62638b6498..a0573847bc55 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1664,7 +1664,7 @@ static int nla_put_via(struct sk_buff *skb, u8 table, const void *addr, int alen) { static const int table_to_family[NEIGH_NR_TABLES + 1] = { - AF_INET, AF_INET6, AF_DECnet, AF_PACKET, + AF_INET, AF_INET6, AF_PACKET, }; struct nlattr *nla; struct rtvia *via; -- cgit v1.3 From 397006ba5d918f9b74e734867e8fddbc36dc2282 Mon Sep 17 00:00:00 2001 From: Elena Salomatkina Date: Sun, 13 Oct 2024 15:45:29 +0300 Subject: net/sched: cbs: Fix integer overflow in cbs_set_port_rate() The subsequent calculation of port_rate = speed * 1000 * BYTES_PER_KBIT, where the BYTES_PER_KBIT is of type LL, may cause an overflow. At least when speed = SPEED_20000, the expression to the left of port_rate will be greater than INT_MAX. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Elena Salomatkina Link: https://patch.msgid.link/20241013124529.1043-1-esalomatkina@ispras.ru Signed-off-by: Jakub Kicinski --- net/sched/sch_cbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 939425da1895..8c9a0400c862 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -310,7 +310,7 @@ static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; - int port_rate; + s64 port_rate; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); -- cgit v1.3 From 46f2a11cb82b657fd15bab1c47821b635e03838b Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:00 +0100 Subject: af_packet: avoid erroring out after sock_init_data() in packet_create() After sock_init_data() the allocated sk object is attached to the provided sock object. On error, packet_create() frees the sk object leaving the dangling pointer in the sock object on return. Some other code may try to use this pointer and cause use-after-free. Suggested-by: Eric Dumazet Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-2-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2ff4b251842d..886c0dd47b66 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3422,17 +3422,17 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; + po = pkt_sk(sk); + err = packet_alloc_pending(po); + if (err) + goto out_sk_free; + sock_init_data(sock, sk); - po = pkt_sk(sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; - err = packet_alloc_pending(po); - if (err) - goto out2; - packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; @@ -3464,7 +3464,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock_prot_inuse_add(net, &packet_proto, 1); return 0; -out2: +out_sk_free: sk_free(sk); out: return err; -- cgit v1.3 From 7c4f78cdb8e7501e9f92d291a7d956591bf73be9 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:01 +0100 Subject: Bluetooth: L2CAP: do not leave dangling sk pointer on error in l2cap_sock_create() bt_sock_alloc() allocates the sk object and attaches it to the provided sock object. On error l2cap_sock_alloc() frees the sk object, but the dangling pointer is still attached to the sock object, which may create use-after-free in other code. Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-3-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/bluetooth/l2cap_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index ba437c6f6ee5..18e89e764f3b 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1886,6 +1886,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, chan = l2cap_chan_create(); if (!chan) { sk_free(sk); + sock->sk = NULL; return NULL; } -- cgit v1.3 From 3945c799f12b8d1f49a3b48369ca494d981ac465 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:02 +0100 Subject: Bluetooth: RFCOMM: avoid leaving dangling sk pointer in rfcomm_sock_alloc() bt_sock_alloc() attaches allocated sk object to the provided sock object. If rfcomm_dlc_alloc() fails, we release the sk object, but leave the dangling pointer in the sock object, which may cause use-after-free. Fix this by swapping calls to bt_sock_alloc() and rfcomm_dlc_alloc(). Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-4-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/bluetooth/rfcomm/sock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f48250e3f2e1..355e1a1698f5 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -274,13 +274,13 @@ static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, struct rfcomm_dlc *d; struct sock *sk; - sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern); - if (!sk) + d = rfcomm_dlc_alloc(prio); + if (!d) return NULL; - d = rfcomm_dlc_alloc(prio); - if (!d) { - sk_free(sk); + sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern); + if (!sk) { + rfcomm_dlc_free(d); return NULL; } -- cgit v1.3 From 811a7ca7320c062e15d0f5b171fe6ad8592d1434 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:03 +0100 Subject: net: af_can: do not leave a dangling sk pointer in can_create() On error can_create() frees the allocated sk object, but sock_init_data() has already attached it to the provided sock object. This will leave a dangling sk pointer in the sock object and may cause use-after-free later. Signed-off-by: Ignat Korchagin Reviewed-by: Vincent Mailhol Reviewed-by: Kuniyuki Iwashima Reviewed-by: Marc Kleine-Budde Link: https://patch.msgid.link/20241014153808.51894-5-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/can/af_can.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 707576eeeb58..01f3fbb3b67d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -171,6 +171,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, /* release sk on errors */ sock_orphan(sk); sock_put(sk); + sock->sk = NULL; } errout: -- cgit v1.3 From b4fcd63f6ef79c73cafae8cf4a114def5fc3d80d Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:04 +0100 Subject: net: ieee802154: do not leave a dangling sk pointer in ieee802154_create() sock_init_data() attaches the allocated sk object to the provided sock object. If ieee802154_create() fails later, the allocated sk object is freed, but the dangling pointer remains in the provided sock object, which may allow use-after-free. Clear the sk pointer in the sock object on error. Signed-off-by: Ignat Korchagin Reviewed-by: Miquel Raynal Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-6-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ieee802154/socket.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 990a83455dcf..18d267921bb5 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1043,19 +1043,21 @@ static int ieee802154_create(struct net *net, struct socket *sock, if (sk->sk_prot->hash) { rc = sk->sk_prot->hash(sk); - if (rc) { - sk_common_release(sk); - goto out; - } + if (rc) + goto out_sk_release; } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); if (rc) - sk_common_release(sk); + goto out_sk_release; } out: return rc; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } static const struct net_proto_family ieee802154_family_ops = { -- cgit v1.3 From 9365fa510c6f82e3aa550a09d0c5c6b44dbc78ff Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:05 +0100 Subject: net: inet: do not leave a dangling sk pointer in inet_create() sock_init_data() attaches the allocated sk object to the provided sock object. If inet_create() fails later, the sk object is freed, but the sock object retains the dangling pointer, which may create use-after-free later. Clear the sk pointer in the sock object on error. Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-7-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv4/af_inet.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b24d74616637..8095e82de808 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -376,32 +376,30 @@ lookup_protocol: inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } -- cgit v1.3 From 9df99c395d0f55fb444ef39f4d6f194ca437d884 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:06 +0100 Subject: net: inet6: do not leave a dangling sk pointer in inet6_create() sock_init_data() attaches the allocated sk pointer to the provided sock object. If inet6_create() fails later, the sk object is released, but the sock object retains the dangling sk pointer, which may cause use-after-free later. Clear the sock sk pointer on error. Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-8-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/ipv6/af_inet6.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index ba69b86f1c7d..f60ec8b0f8ea 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -252,31 +252,29 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); err = sk->sk_prot->hash(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); - if (err) { - sk_common_release(sk); - goto out; - } + if (err) + goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; +out_sk_release: + sk_common_release(sk); + sock->sk = NULL; + goto out; } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, -- cgit v1.3 From 48156296a08c615a6baae514096c4b2e543d1157 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:07 +0100 Subject: net: warn, if pf->create does not clear sock->sk on error All pf->create implementations have been fixed now to clear sock->sk on error, when they deallocate the allocated sk object. Put a warning in place to make sure we don't break this promise in the future. Suggested-by: Kuniyuki Iwashima Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-9-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 24b404299015..9a8e4452b9b2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1576,9 +1576,9 @@ int __sock_create(struct net *net, int family, int type, int protocol, err = pf->create(net, sock, protocol, kern); if (err < 0) { /* ->create should release the allocated sock->sk object on error - * but it may leave the dangling pointer + * and make sure sock->sk is set to NULL to avoid use-after-free */ - sock->sk = NULL; + DEBUG_NET_WARN_ON_ONCE(sock->sk); goto out_module_put; } -- cgit v1.3 From 18429e6e0c2ad26250862a786964d8c73400d9a0 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 14 Oct 2024 16:38:08 +0100 Subject: Revert "net: do not leave a dangling sk pointer, when socket creation fails" This reverts commit 6cd4a78d962bebbaf8beb7d2ead3f34120e3f7b2. inet/inet6->create() implementations have been fixed to explicitly NULL the allocated sk object on error. A warning was put in place to make sure any future changes will not leave a dangling pointer in pf->create() implementations. So this code is now redundant. Suggested-by: Kuniyuki Iwashima Signed-off-by: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014153808.51894-10-ignat@cloudflare.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index f8c0d4eda888..756f8e8e0ac7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3827,9 +3827,6 @@ void sk_common_release(struct sock *sk) sk->sk_prot->unhash(sk); - if (sk->sk_socket) - sk->sk_socket->sk = NULL; - /* * In this point socket cannot receive new packets, but it is possible * that some packets are in flight because some CPU runs receiver and -- cgit v1.3 From 09aec57d8379f14ffde566621b920d97cc0c46e1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:18 -0700 Subject: rtnetlink: Panic when __rtnl_register_many() fails for builtin callers. We will replace all rtnl_register() and rtnl_register_module() with rtnl_register_many(). Currently, rtnl_register() returns nothing and prints an error message when it fails to register a rtnetlink message type and handlers. The failure happens only when rtnl_register_internal() fails to allocate rtnl_msg_handlers[protocol][msgtype], but it's unlikely for built-in callers on boot time. rtnl_register_many() unwinds the previous successful registrations on failure and returns an error, but it will be useless for built-in callers, especially some subsystems that do not have the legacy ioctl() interface and do not work without rtnetlink. Instead of booting up without rtnetlink functionality, let's panic on failure for built-in rtnl_register_many() callers. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index edcb6d43723e..8f2cdb0de4a9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -464,6 +464,10 @@ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { + if (!handler->owner) + panic("Unable to register rtnetlink message " + "handlers, %pS\n", handlers); + __rtnl_unregister_many(handlers, i); break; } -- cgit v1.3 From 181bc7875b71e75a49d75fb6f50915ef28ddcc49 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:19 -0700 Subject: rtnetlink: Use rtnl_register_many(). We will remove rtnl_register() in favour of rtnl_register_many(). When it succeeds, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241014201828.91221-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 63 +++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 8f2cdb0de4a9..0fbbfeb2cb50 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -6843,6 +6843,38 @@ static struct pernet_operations rtnetlink_net_ops = { .exit = rtnetlink_net_exit, }; +static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink}, + {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, + .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink}, + {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, + {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get, + .dumpit = rtnl_stats_dump}, + {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set}, + {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop}, + {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK, + .dumpit = rtnl_bridge_getlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK, + .doit = rtnl_bridge_dellink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK, + .doit = rtnl_bridge_setlink}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get, + .dumpit = rtnl_fdb_dump}, + {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add}, + {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del, + .flags = RTNL_FLAG_BULK_DEL_SUPPORTED}, + {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get, + .dumpit = rtnl_mdb_dump}, +}; + void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) @@ -6850,34 +6882,5 @@ void __init rtnetlink_init(void) register_netdevice_notifier(&rtnetlink_dev_notifier); - rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, - rtnl_dump_ifinfo, RTNL_FLAG_DUMP_SPLIT_NLM_DONE); - rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); - rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); - - rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); - rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); - - rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); - rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); - - rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, - 0); - rtnl_register(PF_UNSPEC, RTM_SETSTATS, rtnl_stats_set, NULL, 0); - - rtnl_register(PF_BRIDGE, RTM_GETMDB, rtnl_mdb_get, rtnl_mdb_dump, 0); - rtnl_register(PF_BRIDGE, RTM_NEWMDB, rtnl_mdb_add, NULL, 0); - rtnl_register(PF_BRIDGE, RTM_DELMDB, rtnl_mdb_del, NULL, - RTNL_FLAG_BULK_DEL_SUPPORTED); + rtnl_register_many(rtnetlink_rtnl_msg_handlers); } -- cgit v1.3 From d0d14aef50a6184426c5a05b9815fb2697d6d42c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:20 -0700 Subject: neighbour: Use rtnl_register_many(). We will remove rtnl_register() in favour of rtnl_register_many(). When it succeeds, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 77b819cd995b..395ae1626eef 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3886,17 +3886,18 @@ EXPORT_SYMBOL(neigh_sysctl_unregister); #endif /* CONFIG_SYSCTL */ +static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNEIGH, .doit = neigh_add}, + {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, + {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, +}; + static int __init neigh_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEIGH, neigh_get, neigh_dump_info, - RTNL_FLAG_DUMP_UNLOCKED); - - rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info, - 0); - rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0); - + rtnl_register_many(neigh_rtnl_msg_handlers); return 0; } -- cgit v1.3 From cc72bb03032568f034c9fb82c63ec847938d6b99 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:21 -0700 Subject: net: sched: Use rtnl_register_many(). We will remove rtnl_register() in favour of rtnl_register_many(). When it succeeds, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20241014201828.91221-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 13 ++++++++----- net/sched/cls_api.c | 25 ++++++++++++++----------- net/sched/sch_api.c | 20 ++++++++++++-------- 3 files changed, 34 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2714c4ed928e..5bbfb83ed600 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -2243,13 +2243,16 @@ out_module_put: return skb->len; } +static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWACTION, .doit = tc_ctl_action}, + {.msgtype = RTM_DELACTION, .doit = tc_ctl_action}, + {.msgtype = RTM_GETACTION, .doit = tc_ctl_action, + .dumpit = tc_dump_action}, +}; + static int __init tc_action_init(void) { - rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, - 0); - + rtnl_register_many(tc_action_rtnl_msg_handlers); return 0; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 17d97bbe890f..7637f979d689 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -4055,6 +4055,19 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; +static const struct rtnl_msg_handler tc_filter_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWTFILTER, .doit = tc_new_tfilter, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_DELTFILTER, .doit = tc_del_tfilter, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETTFILTER, .doit = tc_get_tfilter, + .dumpit = tc_dump_tfilter, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_NEWCHAIN, .doit = tc_ctl_chain}, + {.msgtype = RTM_DELCHAIN, .doit = tc_ctl_chain}, + {.msgtype = RTM_GETCHAIN, .doit = tc_ctl_chain, + .dumpit = tc_dump_chain}, +}; + static int __init tc_filter_init(void) { int err; @@ -4068,17 +4081,7 @@ static int __init tc_filter_init(void) goto err_register_pernet_subsys; xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1); - - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, - tc_dump_tfilter, RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, - tc_dump_chain, 0); + rtnl_register_many(tc_filter_rtnl_msg_handlers); return 0; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 2eefa4783879..da2da2ab858b 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2420,6 +2420,17 @@ static struct pernet_operations psched_net_ops = { DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); #endif +static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc}, + {.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc}, + {.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc, + .dumpit = tc_dump_qdisc}, + {.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass}, + {.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass}, + {.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass, + .dumpit = tc_dump_tclass}, +}; + static int __init pktsched_init(void) { int err; @@ -2438,14 +2449,7 @@ static int __init pktsched_init(void) register_qdisc(&mq_qdisc_ops); register_qdisc(&noqueue_qdisc_ops); - rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, - 0); - rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, - 0); + rtnl_register_many(psched_rtnl_msg_handlers); tc_wrapper_init(); -- cgit v1.3 From 803838a5f6c8d0f0cfc29e9eaa768ad88485ac7f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:22 -0700 Subject: net: Use rtnl_register_many(). We will remove rtnl_register() in favour of rtnl_register_many(). When it succeeds, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0a86aff17f51..809b48c0a528 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1169,6 +1169,14 @@ static void __init netns_ipv4_struct_check(void) } #endif +static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid, + .dumpit = rtnl_net_dumpid, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + void __init net_ns_init(void) { struct net_generic *ng; @@ -1206,11 +1214,7 @@ void __init net_ns_init(void) if (register_pernet_subsys(&net_ns_ops)) panic("Could not register network namespace subsystems"); - rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL, - RTNL_FLAG_DOIT_UNLOCKED); - rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, - RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); + rtnl_register_many(net_ns_rtnl_msg_handlers); } static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -- cgit v1.3 From 465bac91f953d343f5906db1d5f2d58e31b9ab4f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:23 -0700 Subject: ipv4: Use rtnl_register_many(). We will remove rtnl_register() in favour of rtnl_register_many(). When it succeeds, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/fib_rules.c | 17 ++++++++++------- net/ipv4/devinet.c | 18 +++++++++++------- net/ipv4/fib_frontend.c | 14 ++++++++++---- net/ipv4/nexthop.c | 31 ++++++++++++++++++------------- net/ipv4/route.c | 8 ++++++-- 5 files changed, 55 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 82ef090c0037..d0de9677f450 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1291,13 +1291,18 @@ static struct pernet_operations fib_rules_net_ops = { .exit = fib_rules_net_exit, }; +static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule}, + {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, +}; + static int __init fib_rules_init(void) { int err; - rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, - RTNL_FLAG_DUMP_UNLOCKED); + + rtnl_register_many(fib_rules_rtnl_msg_handlers); err = register_pernet_subsys(&fib_rules_net_ops); if (err < 0) @@ -1312,9 +1317,7 @@ static int __init fib_rules_init(void) fail_unregister: unregister_pernet_subsys(&fib_rules_net_ops); fail: - rtnl_unregister(PF_UNSPEC, RTM_NEWRULE); - rtnl_unregister(PF_UNSPEC, RTM_DELRULE); - rtnl_unregister(PF_UNSPEC, RTM_GETRULE); + rtnl_unregister_many(fib_rules_rtnl_msg_handlers); return err; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7c156f85b7d2..d81fff93d208 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2797,6 +2797,16 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = { .set_link_af = inet_set_link_af, }; +static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { + {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr}, + {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr}, + {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, + {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, + .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + void __init devinet_init(void) { register_pernet_subsys(&devinet_ops); @@ -2804,11 +2814,5 @@ void __init devinet_init(void) rtnl_af_register(&inet_af_ops); - rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0); - rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); - rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, - RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); - rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, - inet_netconf_dump_devconf, - RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED); + rtnl_register_many(devinet_rtnl_msg_handlers); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8353518b110a..53bd26315df5 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1649,6 +1649,15 @@ static struct pernet_operations fib_net_ops = { .exit_batch = fib_net_exit_batch, }; +static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { + {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, + .doit = inet_rtm_newroute}, + {.protocol = PF_INET, .msgtype = RTM_DELROUTE, + .doit = inet_rtm_delroute}, + {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, + .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, +}; + void __init ip_fib_init(void) { fib_trie_init(); @@ -1658,8 +1667,5 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, - RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); + rtnl_register_many(fib_rtnl_msg_handlers); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 93aaea0006ba..570e450e008c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -4042,25 +4042,30 @@ static struct pernet_operations nexthop_net_ops = { .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, }; +static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop}, + {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop}, + {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, + .dumpit = rtm_dump_nexthop}, + {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, + .dumpit = rtm_dump_nexthop_bucket}, + {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, + .doit = rtm_new_nexthop}, + {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, + .dumpit = rtm_dump_nexthop}, + {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, + .doit = rtm_new_nexthop}, + {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, + .dumpit = rtm_dump_nexthop}, +}; + static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); - rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, - rtm_dump_nexthop, 0); - - rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); - rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); - - rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); - rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); - - rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, - rtm_dump_nexthop_bucket, 0); + rtnl_register_many(nexthop_rtnl_msg_handlers); return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a0b091a7df87..18a08b4f4a5a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3632,6 +3632,11 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ +static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { + {.protocol = PF_INET, .msgtype = RTM_GETROUTE, + .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + int __init ip_rt_init(void) { void *idents_hash; @@ -3689,8 +3694,7 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED); + rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); -- cgit v1.3 From a37b0e4eca0436ebc17d512d70b1409956340688 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:24 -0700 Subject: ipv6: Use rtnl_register_many(). We will remove rtnl_register_module() in favour of rtnl_register_many(). rtnl_register_many() will unwind the previous successful registrations on failure and simplify module error handling. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 57 +++++++++++++++++++++------------------------------- net/ipv6/addrlabel.c | 28 ++++++++++---------------- net/ipv6/ip6_fib.c | 10 ++++++--- net/ipv6/route.c | 23 +++++++++------------ 4 files changed, 51 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f31528d4f694..ac8645ad2537 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7406,6 +7406,27 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { .set_link_af = inet6_set_link_af, }; +static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK, + .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR, + .doit = inet6_rtm_newaddr}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR, + .doit = inet6_rtm_deladdr}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR, + .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETMULTICAST, + .dumpit = inet6_dump_ifmcaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETANYCAST, + .dumpit = inet6_dump_ifacaddr, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETNETCONF, + .doit = inet6_netconf_get_devconf, .dumpit = inet6_netconf_dump_devconf, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + /* * Init / cleanup code */ @@ -7449,42 +7470,10 @@ int __init addrconf_init(void) rtnl_af_register(&inet6_ops); - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK, - NULL, inet6_dump_ifinfo, RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) + err = rtnl_register_many(addrconf_rtnl_msg_handlers); + if (err) goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR, - inet6_rtm_newaddr, NULL, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR, - inet6_rtm_deladdr, NULL, 0); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR, - inet6_rtm_getaddr, inet6_dump_ifaddr, - RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST, - NULL, inet6_dump_ifmcaddr, - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST, - NULL, inet6_dump_ifacaddr, - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; - err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF, - inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, - RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); - if (err < 0) - goto errout; err = ipv6_addr_label_rtnl_register(); if (err < 0) goto errout; diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index acd70b5992a7..ab054f329e12 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -634,23 +634,17 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; } +static const struct rtnl_msg_handler ipv6_adddr_label_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDRLABEL, + .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDRLABEL, + .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDRLABEL, + .doit = ip6addrlbl_get, .dumpit = ip6addrlbl_dump, + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, +}; + int __init ipv6_addr_label_rtnl_register(void) { - int ret; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL, - ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - if (ret < 0) - return ret; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL, - ip6addrlbl_newdel, - NULL, RTNL_FLAG_DOIT_UNLOCKED); - if (ret < 0) - return ret; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL, - ip6addrlbl_get, - ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED | - RTNL_FLAG_DUMP_UNLOCKED); - return ret; + return rtnl_register_many(ipv6_adddr_label_rtnl_msg_handlers); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c9da10d971fa..6383263bfd04 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2493,6 +2493,12 @@ static struct pernet_operations fib6_net_ops = { .exit = fib6_net_exit, }; +static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .dumpit = inet6_dump_fib, + .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, +}; + int __init fib6_init(void) { int ret = -ENOMEM; @@ -2506,9 +2512,7 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, - inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED | - RTNL_FLAG_DUMP_SPLIT_NLM_DONE); + ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b4251915585f..d7ce5cf2017a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6680,6 +6680,15 @@ static void bpf_iter_unregister(void) #endif #endif +static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, + .doit = inet6_rtm_newroute}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, + .doit = inet6_rtm_delroute}, + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + int __init ip6_route_init(void) { int ret; @@ -6722,19 +6731,7 @@ int __init ip6_route_init(void) if (ret) goto fib6_rules_init; - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE, - inet6_rtm_newroute, NULL, 0); - if (ret < 0) - goto out_register_late_subsys; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE, - inet6_rtm_delroute, NULL, 0); - if (ret < 0) - goto out_register_late_subsys; - - ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, - inet6_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED); + ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; -- cgit v1.3 From 3ac84e31b33e2051e59245b8ceb25d707fa0e553 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:25 -0700 Subject: ipmr: Use rtnl_register_many(). We will remove rtnl_register() and rtnl_register_module() in favour of rtnl_register_many(). When it succeeds for built-in callers, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 22 +++++++++++++--------- net/ipv6/ip6mr.c | 13 +++++++++---- 2 files changed, 22 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7a95daeb1946..b4fc443481ce 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3137,6 +3137,17 @@ static struct pernet_operations ipmr_net_ops = { .exit_batch = ipmr_net_exit_batch, }; +static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { + {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, + .dumpit = ipmr_rtm_dumplink}, + {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, + .doit = ipmr_rtm_route}, + {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, + .doit = ipmr_rtm_route}, + {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, + .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute}, +}; + int __init ip_mr_init(void) { int err; @@ -3157,15 +3168,8 @@ int __init ip_mr_init(void) goto add_proto_fail; } #endif - rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, - ipmr_rtm_getroute, ipmr_rtm_dumproute, 0); - rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, - ipmr_rtm_route, NULL, 0); - rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, - ipmr_rtm_route, NULL, 0); - - rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, - NULL, ipmr_rtm_dumplink, 0); + rtnl_register_many(ipmr_rtnl_msg_handlers); + return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9528e17665fd..437a9fdb67f5 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1367,6 +1367,12 @@ static struct pernet_operations ip6mr_net_ops = { .exit_batch = ip6mr_net_exit_batch, }; +static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR, + .msgtype = RTM_GETROUTE, + .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute}, +}; + int __init ip6_mr_init(void) { int err; @@ -1389,9 +1395,8 @@ int __init ip6_mr_init(void) goto add_proto_fail; } #endif - err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, - ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0); - if (err == 0) + err = rtnl_register_many(ip6mr_rtnl_msg_handlers); + if (!err) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 @@ -1408,7 +1413,7 @@ reg_pernet_fail: void ip6_mr_cleanup(void) { - rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); + rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif -- cgit v1.3 From c82b031dcb19d0c899c6e209c0ae8c0f3fffcd39 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:26 -0700 Subject: dcb: Use rtnl_register_many(). We will remove rtnl_register() in favour of rtnl_register_many(). When it succeeds, rtnl_register_many() guarantees all rtnetlink types in the passed array are supported, and there is no chance that a part of message types is not supported. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/dcb/dcbnl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 2e6b8c8fd2de..03eb1d941fca 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -2408,6 +2408,11 @@ static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; +static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_GETDCB, .doit = dcb_doit}, + {.msgtype = RTM_SETDCB, .doit = dcb_doit}, +}; + static int __init dcbnl_init(void) { int err; @@ -2416,8 +2421,7 @@ static int __init dcbnl_init(void) if (err) return err; - rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); + rtnl_register_many(dcbnl_rtnl_msg_handlers); return 0; } -- cgit v1.3 From df96b8f45aa5808052088bbd2337f837784f06de Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:27 -0700 Subject: can: gw: Use rtnl_register_many(). We will remove rtnl_register_module() in favour of rtnl_register_many(). rtnl_register_many() will unwind the previous successful registrations on failure and simplify module error handling. Let's use rtnl_register_many() instead. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Marc Kleine-Budde Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/can/gw.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/can/gw.c b/net/can/gw.c index 37528826935e..ef93293c1fae 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1265,6 +1265,15 @@ static struct pernet_operations cangw_pernet_ops = { .exit_batch = cangw_pernet_exit_batch, }; +static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE, + .doit = cgw_create_job}, + {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE, + .doit = cgw_remove_job}, + {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE, + .dumpit = cgw_dump_jobs}, +}; + static __init int cgw_module_init(void) { int ret; @@ -1290,27 +1299,13 @@ static __init int cgw_module_init(void) if (ret) goto out_register_notifier; - ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE, - NULL, cgw_dump_jobs, 0); - if (ret) - goto out_rtnl_register1; - - ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE, - cgw_create_job, NULL, 0); - if (ret) - goto out_rtnl_register2; - ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE, - cgw_remove_job, NULL, 0); + ret = rtnl_register_many(cgw_rtnl_msg_handlers); if (ret) - goto out_rtnl_register3; + goto out_rtnl_register; return 0; -out_rtnl_register3: - rtnl_unregister(PF_CAN, RTM_NEWROUTE); -out_rtnl_register2: - rtnl_unregister(PF_CAN, RTM_GETROUTE); -out_rtnl_register1: +out_rtnl_register: unregister_netdevice_notifier(¬ifier); out_register_notifier: kmem_cache_destroy(cgw_cache); -- cgit v1.3 From e1c6c383123ab1caadbfe39b3362ce0cc09dd766 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Oct 2024 13:18:28 -0700 Subject: rtnetlink: Remove rtnl_register() and rtnl_register_module(). No one uses rtnl_register() and rtnl_register_module(). Let's remove them. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241014201828.91221-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 15 ++++++---- net/core/rtnetlink.c | 74 ++++++++++++++----------------------------------- 2 files changed, 31 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 2d3eb7cb4dff..bb49c5708ce7 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -29,6 +29,16 @@ static inline enum rtnl_kinds rtnl_msgtype_kind(int msgtype) return msgtype & RTNL_KIND_MASK; } +/** + * struct rtnl_msg_handler - rtnetlink message type and handlers + * + * @owner: NULL for built-in, THIS_MODULE for module + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + */ struct rtnl_msg_handler { struct module *owner; int protocol; @@ -38,11 +48,6 @@ struct rtnl_msg_handler { int flags; }; -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_register_module(struct module *owner, int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); -int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0fbbfeb2cb50..a9c92392fb1d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -338,57 +338,6 @@ unlock: return ret; } -/** - * rtnl_register_module - Register a rtnetlink message type - * - * @owner: module registering the hook (THIS_MODULE) - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Like rtnl_register, but for use by removable modules. - */ -int rtnl_register_module(struct module *owner, - int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - return rtnl_register_internal(owner, protocol, msgtype, - doit, dumpit, flags); -} -EXPORT_SYMBOL_GPL(rtnl_register_module); - -/** - * rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - */ -void rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) -{ - int err; - - err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, - flags); - if (err) - pr_err("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", protocol, msgtype); -} - /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC @@ -396,7 +345,7 @@ void rtnl_register(int protocol, int msgtype, * * Returns 0 on success or a negative error code. */ -int rtnl_unregister(int protocol, int msgtype) +static int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; @@ -419,7 +368,6 @@ int rtnl_unregister(int protocol, int msgtype) return 0; } -EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol @@ -454,6 +402,26 @@ void rtnl_unregister_all(int protocol) } EXPORT_SYMBOL_GPL(rtnl_unregister_all); +/** + * __rtnl_register_many - Register rtnetlink message types + * @handlers: Array of struct rtnl_msg_handlers + * @n: The length of @handlers + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. + * + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. + * + * When one element of @handlers fails to register, + * 1) built-in: panics. + * 2) modules : the previous successful registrations are unwinded + * and an error is returned. + * + * Use rtnl_register_many(). + */ int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; -- cgit v1.3 From 42dc431f5d0ea9e9c9caf74dcb5b290ce4dd80b4 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Fri, 11 Oct 2024 11:35:47 -0700 Subject: ethtool: rss: prevent rss ctx deletion when in use ntuple filters can specify an rss context to use for packet hashing and queue selection. When a filter is referencing an rss context, it should be invalid for that context to be deleted. A list of active ntuple filters and their associated rss contexts can be compiled by querying a device's ethtool_ops.get_rxnfc. This patch checks to see if any ntuple filters are referencing an rss context during context deletion, and prevents the deletion if the requested context is still in use. Signed-off-by: Daniel Zahka Signed-off-by: Paolo Abeni --- net/ethtool/common.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/common.h | 1 + net/ethtool/ioctl.c | 7 +++++++ 3 files changed, 56 insertions(+) (limited to 'net') diff --git a/net/ethtool/common.c b/net/ethtool/common.c index dd345efa114b..0d62363dbd9d 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -684,6 +684,54 @@ int ethtool_check_max_channel(struct net_device *dev, return 0; } +int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc *info; + int rc, i, rule_cnt; + + if (!ops->get_rxnfc) + return 0; + + rule_cnt = ethtool_get_rxnfc_rule_count(dev); + if (!rule_cnt) + return 0; + + if (rule_cnt < 0) + return -EINVAL; + + info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->cmd = ETHTOOL_GRXCLSRLALL; + info->rule_cnt = rule_cnt; + rc = ops->get_rxnfc(dev, info, info->rule_locs); + if (rc) + goto out_free; + + for (i = 0; i < rule_cnt; i++) { + struct ethtool_rxnfc rule_info = { + .cmd = ETHTOOL_GRXCLSRULE, + .fs.location = info->rule_locs[i], + }; + + rc = ops->get_rxnfc(dev, &rule_info, NULL); + if (rc) + goto out_free; + + if (rule_info.fs.flow_type & FLOW_RSS && + rule_info.rss_context == rss_context) { + rc = -EBUSY; + goto out_free; + } + } + +out_free: + kvfree(info); + return rc; +} + int ethtool_check_ops(const struct ethtool_ops *ops) { if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params)) diff --git a/net/ethtool/common.h b/net/ethtool/common.h index d55d5201b085..4a2de3ce7354 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -47,6 +47,7 @@ bool convert_legacy_settings_to_link_ksettings( int ethtool_check_max_channel(struct net_device *dev, struct ethtool_channels channels, struct genl_info *info); +int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); extern const struct ethtool_phy_ops *ethtool_phy_ops; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 04b34dc6b369..5cc131cdb1bc 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1462,6 +1462,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, mutex_lock(&dev->ethtool->rss_lock); locked = true; } + + if (rxfh.rss_context && rxfh_dev.rss_delete) { + ret = ethtool_check_rss_ctx_busy(dev, rxfh.rss_context); + if (ret) + goto out; + } + if (create) { if (rxfh_dev.rss_delete) { ret = -EINVAL; -- cgit v1.3 From ecb595ebba0e72fd2137260281b3c773171c8317 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Oct 2024 10:58:24 +0100 Subject: net: dsa: remove dsa_port_phylink_mac_select_pcs() There is no longer any reason to implement the mac_select_pcs() callback in DSA. Returning ERR_PTR(-EOPNOTSUPP) is functionally equivalent to not providing the function. Signed-off-by: Russell King (Oracle) Signed-off-by: Andrew Lunn --- net/dsa/port.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/dsa/port.c b/net/dsa/port.c index f1e96706a701..ee0aaec4c8e0 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1575,13 +1575,6 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } -static struct phylink_pcs * -dsa_port_phylink_mac_select_pcs(struct phylink_config *config, - phy_interface_t interface) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) @@ -1604,7 +1597,6 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, } static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { - .mac_select_pcs = dsa_port_phylink_mac_select_pcs, .mac_config = dsa_port_phylink_mac_config, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, -- cgit v1.3 From fa8ef258da2b05a673eb8dc0160a514c80b6ab8c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:44 -0700 Subject: rtnetlink: Allocate linkinfo[] as struct rtnl_newlink_tbs. We will move linkinfo to rtnl_newlink() and pass it down to other functions. Let's pack it into rtnl_newlink_tbs. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a9c92392fb1d..37193402a42c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3622,6 +3622,7 @@ out_unregister: struct rtnl_newlink_tbs { struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; struct nlattr *attr[RTNL_MAX_TYPE + 1]; struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; }; @@ -3630,7 +3631,7 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct rtnl_newlink_tbs *tbs, struct netlink_ext_ack *extack) { - struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; const struct rtnl_link_ops *m_ops; struct net_device *master_dev; @@ -3685,8 +3686,9 @@ replay: ifla_info_policy, NULL); if (err < 0) return err; - } else - memset(linkinfo, 0, sizeof(linkinfo)); + } else { + memset(linkinfo, 0, sizeof(tbs->linkinfo)); + } if (linkinfo[IFLA_INFO_KIND]) { nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); -- cgit v1.3 From a5838cf9b2ee59f2a55b1e486f2250a18a43ee14 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:45 -0700 Subject: rtnetlink: Call validate_linkmsg() in do_setlink(). There are 3 paths that finally call do_setlink(), and validate_linkmsg() is called in each path. 1. RTM_NEWLINK 1-1. dev is found in __rtnl_newlink() 1-2. dev isn't found, but IFLA_GROUP is specified in rtnl_group_changelink() 2. RTM_SETLINK The next patch factorises 1-1 to a separate function. As a preparation, let's move validate_linkmsg() calls to do_setlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 37193402a42c..76593de7014c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2855,6 +2855,10 @@ static int do_setlink(const struct sk_buff *skb, char ifname[IFNAMSIZ]; int err; + err = validate_linkmsg(dev, tb, extack); + if (err < 0) + goto errout; + if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -3269,10 +3273,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - goto errout; - err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; @@ -3516,9 +3516,6 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; err = do_setlink(skb, dev, ifm, extack, tb, 0); if (err < 0) return err; @@ -3744,10 +3741,6 @@ replay: if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - err = validate_linkmsg(dev, tb, extack); - if (err < 0) - return err; - if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) -- cgit v1.3 From cc47bcdf0d2ea6a2f7b10566d9b6776bf61b2d4b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:46 -0700 Subject: rtnetlink: Factorise do_setlink() path from __rtnl_newlink(). __rtnl_newlink() got too long to maintain. For example, netdev_master_upper_dev_get()->rtnl_link_ops is fetched even when IFLA_INFO_SLAVE_DATA is not specified. Let's factorise the single dev do_setlink() path to a separate function. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 142 +++++++++++++++++++++++++++------------------------ 1 file changed, 74 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 76593de7014c..21165cc2b697 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3505,6 +3505,78 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, } EXPORT_SYMBOL(rtnl_create_link); +struct rtnl_newlink_tbs { + struct nlattr *tb[IFLA_MAX + 1]; + struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; + struct nlattr *attr[RTNL_MAX_TYPE + 1]; + struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; +}; + +static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, + struct net_device *dev, + struct rtnl_newlink_tbs *tbs, + struct nlattr **data, + struct netlink_ext_ack *extack) +{ + struct nlattr ** const linkinfo = tbs->linkinfo; + struct nlattr ** const tb = tbs->tb; + int status = 0; + int err; + + if (nlh->nlmsg_flags & NLM_F_EXCL) + return -EEXIST; + + if (nlh->nlmsg_flags & NLM_F_REPLACE) + return -EOPNOTSUPP; + + if (linkinfo[IFLA_INFO_DATA]) { + if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) + return -EOPNOTSUPP; + + err = ops->changelink(dev, tb, data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + if (linkinfo[IFLA_INFO_SLAVE_DATA]) { + const struct rtnl_link_ops *m_ops = NULL; + struct nlattr **slave_data = NULL; + struct net_device *master_dev; + + master_dev = netdev_master_upper_dev_get(dev); + if (master_dev) + m_ops = master_dev->rtnl_link_ops; + + if (!m_ops || !m_ops->slave_changelink) + return -EOPNOTSUPP; + + if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) + return -EINVAL; + + if (m_ops->slave_maxtype) { + err = nla_parse_nested_deprecated(tbs->slave_attr, + m_ops->slave_maxtype, + linkinfo[IFLA_INFO_SLAVE_DATA], + m_ops->slave_policy, extack); + if (err < 0) + return err; + + slave_data = tbs->slave_attr; + } + + err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); + if (err < 0) + return err; + + status |= DO_SETLINK_NOTIFY; + } + + return do_setlink(skb, dev, nlmsg_data(nlh), extack, tb, status); +} + static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, @@ -3617,24 +3689,14 @@ out_unregister: goto out; } -struct rtnl_newlink_tbs { - struct nlattr *tb[IFLA_MAX + 1]; - struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; - struct nlattr *attr[RTNL_MAX_TYPE + 1]; - struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; -}; - static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct rtnl_newlink_tbs *tbs, struct netlink_ext_ack *extack) { struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; - const struct rtnl_link_ops *m_ops; - struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; - struct nlattr **slave_data; char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; @@ -3669,14 +3731,6 @@ replay: dev = NULL; } - master_dev = NULL; - m_ops = NULL; - if (dev) { - master_dev = netdev_master_upper_dev_get(dev); - if (master_dev) - m_ops = master_dev->rtnl_link_ops; - } - if (tb[IFLA_LINKINFO]) { err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], @@ -3715,56 +3769,8 @@ replay: } } - slave_data = NULL; - if (m_ops) { - if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) - return -EINVAL; - - if (m_ops->slave_maxtype && - linkinfo[IFLA_INFO_SLAVE_DATA]) { - err = nla_parse_nested_deprecated(tbs->slave_attr, - m_ops->slave_maxtype, - linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy, - extack); - if (err < 0) - return err; - slave_data = tbs->slave_attr; - } - } - - if (dev) { - int status = 0; - - if (nlh->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; - if (nlh->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; - - if (linkinfo[IFLA_INFO_DATA]) { - if (!ops || ops != dev->rtnl_link_ops || - !ops->changelink) - return -EOPNOTSUPP; - - err = ops->changelink(dev, tb, data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } - - if (linkinfo[IFLA_INFO_SLAVE_DATA]) { - if (!m_ops || !m_ops->slave_changelink) - return -EOPNOTSUPP; - - err = m_ops->slave_changelink(master_dev, dev, tb, - slave_data, extack); - if (err < 0) - return err; - status |= DO_SETLINK_NOTIFY; - } - - return do_setlink(skb, dev, ifm, extack, tb, status); - } + if (dev) + return rtnl_changelink(skb, nlh, ops, dev, tbs, data, extack); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, -- cgit v1.3 From 7fea1a8cb4dfab059547f801ebbe7e79c60bd75a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:47 -0700 Subject: rtnetlink: Move simple validation from __rtnl_newlink() to rtnl_newlink(). We will push RTNL down to rtnl_newlink(). Let's move RTNL-independent validation to rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21165cc2b697..97d6ad65647c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3707,15 +3707,6 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, #ifdef CONFIG_MODULES replay: #endif - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, - ifla_policy, extack); - if (err < 0) - return err; - - err = rtnl_ensure_unique_netns(tb, extack, false); - if (err < 0) - return err; - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; @@ -3731,16 +3722,6 @@ replay: dev = NULL; } - if (tb[IFLA_LINKINFO]) { - err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, - tb[IFLA_LINKINFO], - ifla_info_policy, NULL); - if (err < 0) - return err; - } else { - memset(linkinfo, 0, sizeof(tbs->linkinfo)); - } - if (linkinfo[IFLA_INFO_KIND]) { nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); @@ -3809,6 +3790,7 @@ replay: static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr **tb, **linkinfo; struct rtnl_newlink_tbs *tbs; int ret; @@ -3816,7 +3798,30 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (!tbs) return -ENOMEM; + tb = tbs->tb; + ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, extack); + if (ret < 0) + goto free; + + ret = rtnl_ensure_unique_netns(tb, extack, false); + if (ret < 0) + goto free; + + linkinfo = tbs->linkinfo; + if (tb[IFLA_LINKINFO]) { + ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, + tb[IFLA_LINKINFO], + ifla_info_policy, NULL); + if (ret < 0) + goto free; + } else { + memset(linkinfo, 0, sizeof(tbs->linkinfo)); + } + ret = __rtnl_newlink(skb, nlh, tbs, extack); + +free: kfree(tbs); return ret; } -- cgit v1.3 From 331fe31c50ef5ec1d9161986fd06b934f94176a3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:48 -0700 Subject: rtnetlink: Move rtnl_link_ops_get() and retry to rtnl_newlink(). Currently, if neither dev nor rtnl_link_ops is found in __rtnl_newlink(), we release RTNL and redo the whole process after request_module(), which complicates the logic. The ops will be RTNL-independent later. Let's move the ops lookup to rtnl_newlink() and do the retry earlier. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 97d6ad65647c..e708f0852602 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3690,23 +3690,19 @@ out_unregister: } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + const struct rtnl_link_ops *ops, struct rtnl_newlink_tbs *tbs, struct netlink_ext_ack *extack) { struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; - char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; struct nlattr **data; bool link_specified; int err; -#ifdef CONFIG_MODULES -replay: -#endif ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; @@ -3722,14 +3718,6 @@ replay: dev = NULL; } - if (linkinfo[IFLA_INFO_KIND]) { - nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); - } else { - kind[0] = '\0'; - ops = NULL; - } - data = NULL; if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) @@ -3770,16 +3758,6 @@ replay: return -EOPNOTSUPP; if (!ops) { -#ifdef CONFIG_MODULES - if (kind[0]) { - __rtnl_unlock(); - request_module("rtnl-link-%s", kind); - rtnl_lock(); - ops = rtnl_link_ops_get(kind); - if (ops) - goto replay; - } -#endif NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } @@ -3790,6 +3768,7 @@ replay: static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + const struct rtnl_link_ops *ops = NULL; struct nlattr **tb, **linkinfo; struct rtnl_newlink_tbs *tbs; int ret; @@ -3819,7 +3798,22 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, memset(linkinfo, 0, sizeof(tbs->linkinfo)); } - ret = __rtnl_newlink(skb, nlh, tbs, extack); + if (linkinfo[IFLA_INFO_KIND]) { + char kind[MODULE_NAME_LEN]; + + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + ops = rtnl_link_ops_get(kind); +#ifdef CONFIG_MODULES + if (!ops) { + __rtnl_unlock(); + request_module("rtnl-link-%s", kind); + rtnl_lock(); + ops = rtnl_link_ops_get(kind); + } +#endif + } + + ret = __rtnl_newlink(skb, nlh, ops, tbs, extack); free: kfree(tbs); -- cgit v1.3 From 0d3008d1a9aefb89e09e8dd39134512d678e3461 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:49 -0700 Subject: rtnetlink: Move ops->validate to rtnl_newlink(). ops->validate() does not require RTNL. Let's move it to rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e708f0852602..9c9290a6c271 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3692,16 +3692,14 @@ out_unregister: static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct rtnl_newlink_tbs *tbs, + struct nlattr **data, struct netlink_ext_ack *extack) { - struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); struct net_device *dev; struct ifinfomsg *ifm; - struct nlattr **data; bool link_specified; - int err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { @@ -3718,26 +3716,6 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = NULL; } - data = NULL; - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; - - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, extack); - if (err < 0) - return err; - data = tbs->attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } - } - if (dev) return rtnl_changelink(skb, nlh, ops, dev, tbs, data, extack); @@ -3768,8 +3746,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr **tb, **linkinfo, **data = NULL; const struct rtnl_link_ops *ops = NULL; - struct nlattr **tb, **linkinfo; struct rtnl_newlink_tbs *tbs; int ret; @@ -3813,7 +3791,28 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, #endif } - ret = __rtnl_newlink(skb, nlh, ops, tbs, extack); + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; + + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (ret < 0) + goto free; + + data = tbs->attr; + } + + if (ops->validate) { + ret = ops->validate(tb, data, extack); + if (ret < 0) + goto free; + } + } + + ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); free: kfree(tbs); -- cgit v1.3 From 43c7ce69d28e185f62fe2b8be2c681c5cac0bc6b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:50 -0700 Subject: rtnetlink: Protect struct rtnl_link_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_link_ops is alive during inflight RTM_NEWLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_link_ops_get() now iterates link_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_link_ops_put() to release the pointer after the use. Also, __rtnl_link_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_NEWLINK requests to complete. Note that link_ops needs to be protected by its dedicated lock when RTNL is removed. Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 ++- net/core/rtnetlink.c | 83 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 65 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bb49c5708ce7..1a6aa5ca74f3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -3,6 +3,7 @@ #define __NET_RTNETLINK_H #include +#include #include typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, @@ -69,7 +70,8 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number @@ -100,6 +102,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) */ struct rtnl_link_ops { struct list_head list; + struct srcu_struct srcu; const char *kind; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9c9290a6c271..31b105b3a834 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -457,15 +457,29 @@ EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static LIST_HEAD(link_ops); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { - const struct rtnl_link_ops *ops; + struct rtnl_link_ops *ops; - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** @@ -480,8 +494,16 @@ static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) */ int __rtnl_link_register(struct rtnl_link_ops *ops) { - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + struct rtnl_link_ops *tmp; + int err; + + /* When RTNL is removed, add lock for link_ops. */ + ASSERT_RTNL(); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) + return -EEXIST; + } /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible @@ -491,7 +513,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; - list_add_tail(&ops->list, &link_ops); + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + list_add_tail_rcu(&ops->list, &link_ops); + return 0; } EXPORT_SYMBOL_GPL(__rtnl_link_register); @@ -542,10 +569,12 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - for_each_net(net) { + list_del_rcu(&ops->list); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + for_each_net(net) __rtnl_kill_links(net, ops); - } - list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); @@ -2158,10 +2187,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; -static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) { - const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; @@ -2170,7 +2200,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; @@ -2290,8 +2320,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { - const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -2302,6 +2332,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; + int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; @@ -2335,7 +2366,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: - kind_ops = linkinfo_to_kind_ops(tb[i]); + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { @@ -2361,6 +2392,10 @@ walk_entries: if (err < 0) break; } + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -3747,8 +3782,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **tb, **linkinfo, **data = NULL; - const struct rtnl_link_ops *ops = NULL; + struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + int ops_srcu_index; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); @@ -3780,13 +3816,13 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); rtnl_lock(); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif } @@ -3800,7 +3836,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (ret < 0) - goto free; + goto put_ops; data = tbs->attr; } @@ -3808,12 +3844,15 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ops->validate) { ret = ops->validate(tb, data, extack); if (ret < 0) - goto free; + goto put_ops; } } ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); free: kfree(tbs); return ret; -- cgit v1.3 From 0fef2a1212f1ff68fc3834abd41928b4353f8af6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:51 -0700 Subject: rtnetlink: Call rtnl_link_get_net_capable() in rtnl_newlink(). As a prerequisite of per-netns RTNL, we must fetch netns before looking up dev or moving it to another netns. rtnl_link_get_net_capable() is called in rtnl_newlink_create() and do_setlink(), but both of them need to be moved to the RTNL-independent region, which will be rtnl_newlink(). Let's call rtnl_link_get_net_capable() in rtnl_newlink() and pass the netns down to where needed. Note that the latter two have not passed the nets to do_setlink() yet but will do so after the remaining rtnl_link_get_net_capable() is moved to rtnl_setlink() later. While at it, dest_net is renamed to tgt_net in rtnl_newlink_create() to align with rtnl_{del,set}link(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 51 ++++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 31b105b3a834..f6823c8d21ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3549,7 +3549,7 @@ struct rtnl_newlink_tbs { static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, - struct net_device *dev, + struct net_device *dev, struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3613,10 +3613,10 @@ static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, } static int rtnl_group_changelink(const struct sk_buff *skb, - struct net *net, int group, - struct ifinfomsg *ifm, - struct netlink_ext_ack *extack, - struct nlattr **tb) + struct net *net, struct net *tgt_net, + int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) { struct net_device *dev, *aux; int err; @@ -3634,6 +3634,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, + struct net *tgt_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3641,9 +3642,9 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, unsigned char name_assign_type = NET_NAME_USER; struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; + struct net *link_net; int err; if (!ops->alloc && !ops->setup) @@ -3656,14 +3657,10 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - link_net = get_net_ns_by_id(dest_net, id); + link_net = get_net_ns_by_id(tgt_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; @@ -3676,7 +3673,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, link_net = NULL; } - dev = rtnl_create_link(link_net ? : dest_net, ifname, + dev = rtnl_create_link(link_net ? : tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); @@ -3698,7 +3695,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); + err = dev_change_net_namespace(dev, tgt_net, ifname); if (err < 0) goto out_unregister; } @@ -3710,7 +3707,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, out: if (link_net) put_net(link_net); - put_net(dest_net); + return err; out_unregister: if (ops->newlink) { @@ -3726,6 +3723,7 @@ out_unregister: static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, + struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3752,19 +3750,18 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (dev) - return rtnl_changelink(skb, nlh, ops, dev, tbs, data, extack); + return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ - if (link_specified) + if (link_specified || !tb[IFLA_GROUP]) return -ENODEV; - if (tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, - nla_get_u32(tb[IFLA_GROUP]), - ifm, extack, tb); - return -ENODEV; + + return rtnl_group_changelink(skb, net, tgt_net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) @@ -3775,7 +3772,7 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, tgt_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3784,6 +3781,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **tb, **linkinfo, **data = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + struct net *tgt_net; int ops_srcu_index; int ret; @@ -3848,8 +3846,15 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } - ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); + tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + ret = PTR_ERR(tgt_net); + goto put_ops; + } + + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, tbs, data, extack); + put_net(tgt_net); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); -- cgit v1.3 From f7774eec20b41fae36a58e8ab04ff4dd48bb1845 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:52 -0700 Subject: rtnetlink: Fetch IFLA_LINK_NETNSID in rtnl_newlink(). Another netns option for RTM_NEWLINK is IFLA_LINK_NETNSID and is fetched in rtnl_newlink_create(). This must be done before holding rtnl_net_lock(). Let's move IFLA_LINK_NETNSID processing to rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f6823c8d21ad..eee0f820ddf6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3634,7 +3634,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, - struct net *tgt_net, + struct net *tgt_net, struct net *link_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3644,7 +3644,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; - struct net *link_net; int err; if (!ops->alloc && !ops->setup) @@ -3657,22 +3656,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(tgt_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - dev = rtnl_create_link(link_net ? : tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { @@ -3705,9 +3688,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out_unregister; } out: - if (link_net) - put_net(link_net); - return err; out_unregister: if (ops->newlink) { @@ -3723,7 +3703,7 @@ out_unregister: static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, - struct net *tgt_net, + struct net *tgt_net, struct net *link_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3772,16 +3752,16 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, tgt_net, nlh, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **tb, **linkinfo, **data = NULL; + struct net *tgt_net, *link_net = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; - struct net *tgt_net; int ops_srcu_index; int ret; @@ -3852,8 +3832,27 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_ops; } - ret = __rtnl_newlink(skb, nlh, ops, tgt_net, tbs, data, extack); + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + link_net = get_net_ns_by_id(tgt_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + ret = -EINVAL; + goto put_net; + } + + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto put_net; + } + } + + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); +put_net: + if (link_net) + put_net(link_net); put_net(tgt_net); put_ops: if (ops) -- cgit v1.3 From 175cfc5cd373b6665ec145cafe742252453a7c0e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:53 -0700 Subject: rtnetlink: Clean up rtnl_dellink(). We will push RTNL down to rtnl_delink(). Let's unify the error path to make it easy to place rtnl_net_lock(). While at it, keep the variables in reverse xmas order. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eee0f820ddf6..a19b2eb2727e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3368,14 +3368,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *tgt_net = net; - struct net_device *dev = NULL; - struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; - int err; + struct net_device *dev = NULL; + struct net *tgt_net = net; int netnsid = -1; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3393,27 +3393,20 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - err = -EINVAL; - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); + + if (dev) + err = rtnl_delete_link(dev, portid, nlh); + else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - goto out; - - if (!dev) { - if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) - err = -ENODEV; - - goto out; - } - - err = rtnl_delete_link(dev, portid, nlh); + err = -EINVAL; -out: if (netnsid >= 0) put_net(tgt_net); -- cgit v1.3 From 6e495fad88ef7bdea70b0e4a1a714f6eccab2a5a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:54 -0700 Subject: rtnetlink: Clean up rtnl_setlink(). We will push RTNL down to rtnl_setlink(). Let's unify the error path to make it easy to place rtnl_net_lock(). While at it, keep the variables in reverse xmas order. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a19b2eb2727e..f89a902458d6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3279,11 +3279,11 @@ static struct net_device *rtnl_dev_get(struct net *net, static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3294,21 +3294,18 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - err = -EINVAL; - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else - goto errout; + err = -EINVAL; - if (dev == NULL) { + if (dev) + err = do_setlink(skb, dev, ifm, extack, tb, 0); + else if (!err) err = -ENODEV; - goto errout; - } - err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; } -- cgit v1.3 From a0b63c6457e100b84b1ff9179bc328c0924de75c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:55 -0700 Subject: rtnetlink: Call rtnl_link_get_net_capable() in do_setlink(). We will push RTNL down to rtnl_setlink(). RTM_SETLINK could call rtnl_link_get_net_capable() in do_setlink() to move a dev to a new netns, but the netns needs to be fetched before holding rtnl_net_lock(). Let's move it to rtnl_setlink() and pass the netns to do_setlink(). Now, RTM_NEWLINK paths (rtnl_changelink() and rtnl_group_changelink()) can pass the prefetched netns to do_setlink(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f89a902458d6..445e6ffed75e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2881,8 +2881,8 @@ static int do_set_proto_down(struct net_device *dev, #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 -static int do_setlink(const struct sk_buff *skb, - struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, struct net_device *dev, + struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { @@ -2899,27 +2899,19 @@ static int do_setlink(const struct sk_buff *skb, else ifname[0] = '\0'; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? ifname : NULL; - struct net *net; int new_ifindex; - net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; - } - if (tb[IFLA_NEW_IFINDEX]) new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); else new_ifindex = 0; - err = __dev_change_net_namespace(dev, net, pat, new_ifindex); - put_net(net); + err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); if (err) goto errout; + status |= DO_SETLINK_MODIFIED; } @@ -3283,6 +3275,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; + struct net *tgt_net; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, @@ -3294,6 +3287,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + err = PTR_ERR(tgt_net); + goto errout; + } + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3302,10 +3301,11 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; if (dev) - err = do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); else if (!err) err = -ENODEV; + put_net(tgt_net); errout: return err; } @@ -3599,7 +3599,7 @@ static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, nlmsg_data(nlh), extack, tb, status); + return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); } static int rtnl_group_changelink(const struct sk_buff *skb, @@ -3613,7 +3613,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } -- cgit v1.3 From 26eebdc4b005ccd4cf63f4fef4c9c0adf9bfa380 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:56 -0700 Subject: rtnetlink: Return int from rtnl_af_register(). The next patch will add init_srcu_struct() in rtnl_af_register(), then we need to handle its error. Let's add the error handling in advance to make the following patch cleaner. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matt Johnston Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 2 +- net/bridge/br_netlink.c | 6 +++++- net/core/rtnetlink.c | 4 +++- net/ipv4/devinet.c | 3 ++- net/ipv6/addrconf.c | 5 ++++- net/mctp/device.c | 16 +++++++++++----- net/mpls/af_mpls.c | 5 ++++- 7 files changed, 30 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 1a6aa5ca74f3..969138ae2f4b 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -204,7 +204,7 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void rtnl_af_register(struct rtnl_af_ops *ops); +int rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6b97ae47f855..3e0f47203f2a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1924,7 +1924,9 @@ int __init br_netlink_init(void) if (err) goto out; - rtnl_af_register(&br_af_ops); + err = rtnl_af_register(&br_af_ops); + if (err) + goto out_vlan; err = rtnl_link_register(&br_link_ops); if (err) @@ -1934,6 +1936,8 @@ int __init br_netlink_init(void) out_af: rtnl_af_unregister(&br_af_ops); +out_vlan: + br_vlan_rtnl_uninit(); out: return err; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 445e6ffed75e..70b663aca209 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -686,11 +686,13 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) * * Returns 0 on success or a negative error code. */ -void rtnl_af_register(struct rtnl_af_ops *ops) +int rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); + + return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ec29ead83e74..0ff9c0abfaa0 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2827,7 +2827,8 @@ void __init devinet_init(void) register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); - rtnl_af_register(&inet_af_ops); + if (rtnl_af_register(&inet_af_ops)) + panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ac8645ad2537..d0a99710d65d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7468,7 +7468,9 @@ int __init addrconf_init(void) addrconf_verify(&init_net); - rtnl_af_register(&inet6_ops); + err = rtnl_af_register(&inet6_ops); + if (err) + goto erraf; err = rtnl_register_many(addrconf_rtnl_msg_handlers); if (err) @@ -7482,6 +7484,7 @@ int __init addrconf_init(void) errout: rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); +erraf: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: destroy_workqueue(addrconf_wq); diff --git a/net/mctp/device.c b/net/mctp/device.c index 85cc5f31f1e7..3d75b919995d 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -535,14 +535,20 @@ int __init mctp_device_init(void) int err; register_netdevice_notifier(&mctp_dev_nb); - rtnl_af_register(&mctp_af_ops); + + err = rtnl_af_register(&mctp_af_ops); + if (err) + goto err_notifier; err = rtnl_register_many(mctp_device_rtnl_msg_handlers); - if (err) { - rtnl_af_unregister(&mctp_af_ops); - unregister_netdevice_notifier(&mctp_dev_nb); - } + if (err) + goto err_af; + return 0; +err_af: + rtnl_af_unregister(&mctp_af_ops); +err_notifier: + unregister_netdevice_notifier(&mctp_dev_nb); return err; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a0573847bc55..1f63b32d76d6 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2753,7 +2753,9 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); - rtnl_af_register(&mpls_af_ops); + err = rtnl_af_register(&mpls_af_ops); + if (err) + goto out_unregister_dev_type; err = rtnl_register_many(mpls_rtnl_msg_handlers); if (err) @@ -2773,6 +2775,7 @@ out_unregister_rtnl: rtnl_unregister_many(mpls_rtnl_msg_handlers); out_unregister_rtnl_af: rtnl_af_unregister(&mpls_af_ops); +out_unregister_dev_type: dev_remove_pack(&mpls_packet_type); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); -- cgit v1.3 From 6ab0f866948323724e95cf14d9e47fd77703c192 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:57 -0700 Subject: rtnetlink: Protect struct rtnl_af_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_af_ops is alive during inflight RTM_SETLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_af_lookup() now iterates rtnl_af_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_af_put() to release the pointer after the use. Also, rtnl_af_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_SETLINK requests to complete. Note that rtnl_af_ops needs to be protected by its dedicated lock when RTNL is removed. Note also that BUG_ON() in do_setlink() is changed to the normal error handling as a different af_ops might be found after validate_linkmsg(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 +++- net/core/rtnetlink.c | 63 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 969138ae2f4b..e0d9a8eae6b6 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -172,7 +172,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. @@ -185,6 +186,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); */ struct rtnl_af_ops { struct list_head list; + struct srcu_struct srcu; + int family; int (*fill_link_af)(struct sk_buff *skb, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 70b663aca209..194a81e5f608 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -666,18 +666,31 @@ static size_t rtnl_link_get_size(const struct net_device *dev) static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; ASSERT_RTNL(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** @@ -688,6 +701,11 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) */ int rtnl_af_register(struct rtnl_af_ops *ops) { + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; + rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); @@ -707,6 +725,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) rtnl_unlock(); synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -2579,20 +2599,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - af_ops = rtnl_af_lookup(nla_type(af)); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; - - if (af_ops->validate_link_af) { + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); - if (err < 0) - return err; - } + else + err = 0; + + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } @@ -3172,11 +3196,18 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto errout; + } err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); + if (err < 0) goto errout; -- cgit v1.3 From 867d13a75488d5c20256e93186d9cb6361fb75a4 Mon Sep 17 00:00:00 2001 From: Asbjørn Sloth Tønnesen Date: Thu, 17 Oct 2024 09:47:02 +0000 Subject: tools: ynl-gen: use big-endian netlink attribute types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change ynl-gen-c.py to use NLA_BE16 and NLA_BE32 types to represent big-endian u16 and u32 ynl types. Doing this enables those attributes to have range checks applied, as the validator will then convert to host endianness prior to validation. The autogenerated kernel/uapi code have been regenerated by running: ./tools/net/ynl/ynl-regen.sh -f This changes the policy types of the following attributes: FOU_ATTR_PORT (NLA_U16 -> NLA_BE16) FOU_ATTR_PEER_PORT (NLA_U16 -> NLA_BE16) These two are used with nla_get_be16/nla_put_be16(). MPTCP_PM_ADDR_ATTR_ADDR4 (NLA_U32 -> NLA_BE32) This one is used with nla_get_in_addr/nla_put_in_addr(), which uses nla_get_be32/nla_put_be32(). IOWs the generated changes are AFAICT aligned with their implementations. The generated userspace code remains identical, and have been verified by comparing the output generated by the following command: make -C tools/net/ynl/generated Signed-off-by: Asbjørn Sloth Tønnesen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241017094704.3222173-1-ast@fiberby.net Signed-off-by: Paolo Abeni --- net/ipv4/fou_nl.c | 4 ++-- net/mptcp/mptcp_pm_gen.c | 2 +- tools/net/ynl/ynl-gen-c.py | 5 ++++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 98b90107b5ab..3d9614609b2d 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -12,7 +12,7 @@ /* Global operation policy for fou */ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { - [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_PORT] = { .type = NLA_BE16, }, [FOU_ATTR_AF] = { .type = NLA_U8, }, [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, [FOU_ATTR_TYPE] = { .type = NLA_U8, }, @@ -21,7 +21,7 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { [FOU_ATTR_LOCAL_V6] = { .len = 16, }, [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, [FOU_ATTR_PEER_V6] = { .len = 16, }, - [FOU_ATTR_PEER_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index c30a2a90a192..5a6b2b4510d3 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -14,7 +14,7 @@ const struct nla_policy mptcp_pm_address_nl_policy[MPTCP_PM_ADDR_ATTR_IF_IDX + 1] = { [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, - [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, + [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_BE32, }, [MPTCP_PM_ADDR_ATTR_ADDR6] = NLA_POLICY_EXACT_LEN(16), [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16, }, [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32, }, diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index d64cb2b49c44..1a825b4081b2 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -167,7 +167,10 @@ class Type(SpecAttr): return '{ .type = ' + policy + ', }' def attr_policy(self, cw): - policy = c_upper('nla-' + self.attr['type']) + policy = f'NLA_{c_upper(self.type)}' + if self.attr.get('byte-order') == 'big-endian': + if self.type in {'u16', 'u32'}: + policy = f'NLA_BE{self.type[1:]}' spec = self._attr_policy(policy) cw.p(f"\t[{self.enum_name}] = {spec},") -- cgit v1.3 From 6886c14bdc309fa1c92b22f9587c5ca78f1920b7 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 17 Oct 2024 21:34:35 +0800 Subject: net: use sock_valbool_flag() only in __sock_set_timestamps() sock_{,re}set_flag() are contained in sock_valbool_flag(), it would be cleaner to just use sock_valbool_flag(). Signed-off-by: Yajun Deng Link: https://patch.msgid.link/20241017133435.2552-1-yajun.deng@linux.dev Signed-off-by: Paolo Abeni --- net/core/sock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 756f8e8e0ac7..7f398bd07fb7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -820,14 +820,11 @@ EXPORT_SYMBOL(sock_set_sndtimeo); static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) { + sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); if (val) { sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); - sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); - sock_set_flag(sk, SOCK_RCVTSTAMP); sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); } } -- cgit v1.3 From d631094e4d20d136f159c6e0f723b7aecbc12d2f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 17 Oct 2024 17:24:17 +0200 Subject: net: sysctl: remove always-true condition Before adding a new line at the end of the temporary buffer in dump_cpumask, a length check is performed to ensure there is space for it. len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, ...); if (len < *lenp) kbuf[len++] = '\n'; Note that the check is currently logically wrong, the written length is compared against the output buffer, not the temporary one. However this has no consequence as this is always true, even if fixed: scnprintf includes a null char at the end of the buffer but the returned length do not include it and there is always space for overriding it with a newline. Remove the condition. Signed-off-by: Antoine Tenart Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/core/sysctl_net_core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b60fac380cec..e7c0121dfaa1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,8 +69,10 @@ static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, return; } - if (len < *lenp) - kbuf[len++] = '\n'; + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; -- cgit v1.3 From a8cc8fa14541d6f8f1fbe78607a096e97c80179e Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 17 Oct 2024 17:24:18 +0200 Subject: net: sysctl: do not reserve an extra char in dump_cpumask temporary buffer When computing the length we'll be able to use out of the buffers, one char is removed from the temporary one to make room for a newline. It should be removed from the output buffer length too, but in reality this is not needed as the later call to scnprintf makes sure a null char is written at the end of the buffer which we override with the newline. Signed-off-by: Antoine Tenart Signed-off-by: Paolo Abeni --- net/core/sysctl_net_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index e7c0121dfaa1..8dc07f7b1772 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -62,7 +62,7 @@ static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, return; } - len = min(sizeof(kbuf) - 1, *lenp); + len = min(sizeof(kbuf), *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; -- cgit v1.3 From 124afe773b1ad6cddb8f661a14a32c9e76ca92a6 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 17 Oct 2024 17:24:19 +0200 Subject: net: sysctl: allow dump_cpumask to handle higher numbers of CPUs This fixes the output of rps_default_mask and flow_limit_cpu_bitmap when the CPU count is > 448, as it was truncated. The underlying values are actually stored correctly when writing to these sysctl but displaying them uses a fixed length temporary buffer in dump_cpumask. This buffer can be too small if the CPU count is > 448. Fix this by dynamically allocating the buffer in dump_cpumask, using a guesstimate of what we need. Signed-off-by: Antoine Tenart Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- net/core/sysctl_net_core.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8dc07f7b1772..cb8d32e5c14e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -51,22 +51,32 @@ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) -static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, - struct cpumask *mask) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) { - char kbuf[128]; + char *kbuf; int len; if (*ppos || !*lenp) { *lenp = 0; - return; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; } - len = min(sizeof(kbuf), *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; - return; + goto free_buf; } /* scnprintf writes a trailing null char not counted in the returned @@ -76,6 +86,10 @@ static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; + +free_buf: + kfree(kbuf); + return 0; } #endif @@ -119,8 +133,8 @@ static int rps_default_mask_sysctl(const struct ctl_table *table, int write, if (err) goto done; } else { - dump_cpumask(buffer, lenp, ppos, - net->core.rps_default_mask ? : cpu_none_mask); + err = dump_cpumask(buffer, lenp, ppos, + net->core.rps_default_mask ? : cpu_none_mask); } done: @@ -249,7 +263,7 @@ write_unlock: } rcu_read_unlock(); - dump_cpumask(buffer, lenp, ppos, mask); + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: -- cgit v1.3 From 83c289e81e88d01e55d6d56531502ed7b4886a05 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Oct 2024 19:19:34 +0300 Subject: net/sched: act_api: unexport tcf_action_dump_1() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn't used outside act_api.c, but is called by tcf_dump_walker() prior to its definition. So move it upwards and make it static. Simultaneously, reorder the variable declarations so that they follow the networking "reverse Christmas tree" coding style. Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241017161934.3599046-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- include/net/act_api.h | 1 - net/sched/act_api.c | 89 +++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 77ee0c657e2c..404df8557f6a 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -219,7 +219,6 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); -int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); static inline void tcf_action_update_bstats(struct tc_action *a, struct sk_buff *skb) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5bbfb83ed600..c3f8dd7f8e65 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -504,6 +504,50 @@ nla_put_failure: return -1; } +static int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + int err = -EINVAL; + u32 flags; + + if (tcf_action_dump_terse(skb, a, false)) + goto nla_put_failure; + + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && + nla_put_bitfield32(skb, TCA_ACT_HW_STATS, + a->hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + if (a->used_hw_stats_valid && + nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, + a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) + goto nla_put_failure; + + flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; + if (flags && + nla_put_bitfield32(skb, TCA_ACT_FLAGS, + flags, flags)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) + goto nla_put_failure; + + nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { + nla_nest_end(skb, nest); + return err; + } + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -1190,51 +1234,6 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) -{ - int err = -EINVAL; - unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; - u32 flags; - - if (tcf_action_dump_terse(skb, a, false)) - goto nla_put_failure; - - if (a->hw_stats != TCA_ACT_HW_STATS_ANY && - nla_put_bitfield32(skb, TCA_ACT_HW_STATS, - a->hw_stats, TCA_ACT_HW_STATS_ANY)) - goto nla_put_failure; - - if (a->used_hw_stats_valid && - nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, - a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) - goto nla_put_failure; - - flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; - if (flags && - nla_put_bitfield32(skb, TCA_ACT_FLAGS, - flags, flags)) - goto nla_put_failure; - - if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) - goto nla_put_failure; - - nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - err = tcf_action_dump_old(skb, a, bind, ref); - if (err > 0) { - nla_nest_end(skb, nest); - return err; - } - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} -EXPORT_SYMBOL(tcf_action_dump_1); - int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse) { -- cgit v1.3 From 7213a1c417d2c690de2c5aaa05b9dbec0d68a1b1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 10:47:32 -0700 Subject: ip6mr: Add __init to ip6_mr_cleanup(). kernel test robot reported a section mismatch in ip6_mr_cleanup(). WARNING: modpost: vmlinux: section mismatch in reference: ip6_mr_cleanup+0x0 (section: .text) -> 0xffffffff (section: .init.rodata) WARNING: modpost: vmlinux: section mismatch in reference: ip6_mr_cleanup+0x14 (section: .text) -> ip6mr_rtnl_msg_handlers (section: .init.rodata) ip6_mr_cleanup() uses ip6mr_rtnl_msg_handlers[] that has __initconst_or_module qualifier. ip6_mr_cleanup() is only called from inet6_init() but does not have __init qualifier. Let's add __init to ip6_mr_cleanup(). Fixes: 3ac84e31b33e ("ipmr: Use rtnl_register_many().") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410180139.B3HeemsC-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241017174732.39487-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/ip6mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 437a9fdb67f5..8add0f45aa52 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1411,7 +1411,7 @@ reg_pernet_fail: return err; } -void ip6_mr_cleanup(void) +void __init ip6_mr_cleanup(void) { rtnl_unregister_many(ip6mr_rtnl_msg_handlers); #ifdef CONFIG_IPV6_PIMSM_V2 -- cgit v1.3 From c972c1c41d9b20fb38b54e77dcee763e27e715a9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 18:41:00 -0700 Subject: ipv4: Switch inet_addr_hash() to less predictable hash. Recently, commit 4a0ec2aa0704 ("ipv6: switch inet6_addr_hash() to less predictable hash") and commit 4daf4dc275f1 ("ipv6: switch inet6_acaddr_hash() to less predictable hash") hardened IPv6 address hash functions. inet_addr_hash() is also highly predictable, and a malicious use could abuse a specific bucket. Let's follow the change on IPv4 by using jhash_1word(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241018014100.93776-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 5 +++++ net/ipv4/devinet.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 4be0a6a603b2..0e548c1f2a0e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -690,6 +690,11 @@ static inline unsigned int ipv4_addr_hash(__be32 ip) return (__force unsigned int) ip; } +static inline u32 __ipv4_addr_hash(const __be32 ip, const u32 initval) +{ + return jhash_1word((__force u32)ip, initval); +} + static inline u32 ipv4_portaddr_hash(const struct net *net, __be32 saddr, unsigned int port) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0ff9c0abfaa0..5f859d01cbbe 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -121,7 +121,7 @@ struct inet_fill_args { static u32 inet_addr_hash(const struct net *net, __be32 addr) { - u32 val = (__force u32) addr ^ net_hash_mix(net); + u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val, IN4_ADDR_HSIZE_SHIFT); } -- cgit v1.3 From e44ef3f66c5472c2cbc6957c684d7279c26b0db1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Oct 2024 05:21:08 +0000 Subject: netpoll: remove ndo_netpoll_setup() second argument npinfo is not used in any of the ndo_netpoll_setup() methods. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241018052108.2610827-1-edumazet@google.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/team/team_core.c | 3 +-- include/linux/netdevice.h | 3 +-- net/8021q/vlan_dev.c | 2 +- net/bridge/br_device.c | 2 +- net/core/netpoll.c | 2 +- net/dsa/user.c | 3 +-- 8 files changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b1bffd8e9a95..3928287f5865 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1476,7 +1476,7 @@ static void bond_netpoll_cleanup(struct net_device *bond_dev) slave_disable_netpoll(slave); } -static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int bond_netpoll_setup(struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index cf18e66de142..edbd5afcec41 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1117,7 +1117,7 @@ static void macvlan_dev_poll_controller(struct net_device *dev) return; } -static int macvlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int macvlan_dev_netpoll_setup(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); struct net_device *real_dev = vlan->lowerdev; diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index 18191d5a8bd4..a1b27b69f010 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -1946,8 +1946,7 @@ static void team_netpoll_cleanup(struct net_device *dev) mutex_unlock(&team->lock); } -static int team_netpoll_setup(struct net_device *dev, - struct netpoll_info *npifo) +static int team_netpoll_setup(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8feaca12655e..86a0b7eb9461 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1425,8 +1425,7 @@ struct net_device_ops { __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); - int (*ndo_netpoll_setup)(struct net_device *dev, - struct netpoll_info *info); + int (*ndo_netpoll_setup)(struct net_device *dev); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 458040e8a0e0..91d134961357 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -725,7 +725,7 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 26b79feb385d..0ab4613aa07a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -328,7 +328,7 @@ int br_netpoll_enable(struct net_bridge_port *p) return __br_netpoll_enable(p); } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index aa49b92e9194..94b7f07a952f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -641,7 +641,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev); if (err) goto free_npinfo; } diff --git a/net/dsa/user.c b/net/dsa/user.c index 64f660d2334b..91a1fa5f8ab0 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1308,8 +1308,7 @@ static int dsa_user_set_pauseparam(struct net_device *dev, } #ifdef CONFIG_NET_POLL_CONTROLLER -static int dsa_user_netpoll_setup(struct net_device *dev, - struct netpoll_info *ni) +static int dsa_user_netpoll_setup(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); -- cgit v1.3 From 074a8b54dacc1920f54381f3661ecee6786b0c21 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 7 Oct 2024 15:00:45 +0300 Subject: wifi: mac80211: Add support to indicate that a new interface is to be added Add support to indicate to the driver that an interface is about to be added so that the driver could prepare its resources early if it needs so. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e0e8563e1c30.Ifccc96a46a347eb15752caefc9f4eff31f75ed47@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 ++++++++ net/mac80211/cfg.c | 18 ++++++++++++++++++ net/mac80211/driver-ops.h | 12 ++++++++++++ net/mac80211/trace.h | 19 +++++++++++++++++++ 4 files changed, 57 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 333e0fae6796..0b8df8ec5a3b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4444,6 +4444,12 @@ struct ieee80211_prep_tx_info { * if the requested TID-To-Link mapping can be accepted or not. * If it's not accepted the driver may suggest a preferred mapping and * modify @ttlm parameter with the suggested TID-to-Link mapping. + * @prep_add_interface: prepare for interface addition. This can be used by + * drivers to prepare for the addition of a new interface, e.g., allocate + * the needed resources etc. This callback doesn't guarantee that an + * interface with the specified type would be added, and thus drivers that + * implement this callback need to handle such cases. The type is the full + * &enum nl80211_iftype. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4828,6 +4834,8 @@ struct ieee80211_ops { enum ieee80211_neg_ttlm_res (*can_neg_ttlm)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_neg_ttlm *ttlm); + void (*prep_add_interface)(struct ieee80211_hw *hw, + enum nl80211_iftype type); }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 847304a3a29a..ce9558cd1576 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -194,6 +194,24 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, } } + /* Let the driver know that an interface is going to be added. + * Indicate so only for interface types that will be added to the + * driver. + */ + switch (type) { + case NL80211_IFTYPE_AP_VLAN: + break; + case NL80211_IFTYPE_MONITOR: + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || + !(params->flags & MONITOR_FLAG_ACTIVE)) + break; + fallthrough; + default: + drv_prep_add_interface(local, + ieee80211_vif_type_p2p(&sdata->vif)); + break; + } + return wdev; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index d382d9729e85..48bc2da728c0 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1728,4 +1728,16 @@ drv_can_neg_ttlm(struct ieee80211_local *local, return res; } + +static inline void +drv_prep_add_interface(struct ieee80211_local *local, + enum nl80211_iftype type) +{ + trace_drv_prep_add_interface(local, type); + if (local->ops->prep_add_interface) + local->ops->prep_add_interface(&local->hw, type); + + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index dc498cd8cd91..e6f0ce8e5d43 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -3154,6 +3154,25 @@ TRACE_EVENT(drv_neg_ttlm_res, LOCAL_PR_ARG, VIF_PR_ARG, __entry->res ) ); + +TRACE_EVENT(drv_prep_add_interface, + TP_PROTO(struct ieee80211_local *local, + enum nl80211_iftype type), + + TP_ARGS(local, type), + TP_STRUCT__entry(LOCAL_ENTRY + __field(u32, type) + ), + + TP_fast_assign(LOCAL_ASSIGN; + __entry->type = type; + ), + + TP_printk(LOCAL_PR_FMT " type: %u\n ", + LOCAL_PR_ARG, __entry->type + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.3 From 62262dd00c319195f2e14022903b7ebbb53119bc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:46 +0300 Subject: wifi: cfg80211: disallow SMPS in AP mode In practice, userspace hasn't been able to set this for many years, and mac80211 has already rejected it (which is now no longer needed), so reject SMPS mode (other than "OFF" to be a bit more compatible) in AP mode. Also remove the parameter from the AP settings struct. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.fe1fc46484cf.I8676fb52b818a4bedeb9c25b901e1396277ffc0b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/quantenna/qtnfmac/commands.c | 2 +- include/net/cfg80211.h | 2 -- net/mac80211/cfg.c | 3 --- net/wireless/nl80211.c | 30 +++-------------------- 4 files changed, 4 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/quantenna/qtnfmac/commands.c b/drivers/net/wireless/quantenna/qtnfmac/commands.c index 9540ad6196d7..956c5763662f 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/commands.c +++ b/drivers/net/wireless/quantenna/qtnfmac/commands.c @@ -257,7 +257,7 @@ int qtnf_cmd_send_start_ap(struct qtnf_vif *vif, cmd->beacon_interval = cpu_to_le16(s->beacon_interval); cmd->hidden_ssid = qlink_hidden_ssid_nl2q(s->hidden_ssid); cmd->inactivity_timeout = cpu_to_le16(s->inactivity_timeout); - cmd->smps_mode = s->smps_mode; + cmd->smps_mode = NL80211_SMPS_OFF; cmd->p2p_ctwindow = s->p2p_ctwindow; cmd->p2p_opp_ps = s->p2p_opp_ps; cmd->pbss = s->pbss; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 69ec1eb41a09..c8ce5c2e14f4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1460,7 +1460,6 @@ struct cfg80211_unsol_bcast_probe_resp { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) - * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -1498,7 +1497,6 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; - enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ce9558cd1576..548b9bbdac04 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1312,9 +1312,6 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) return -EALREADY; - if (params->smps_mode != NL80211_SMPS_OFF) - return -EOPNOTSUPP; - link->smps_mode = IEEE80211_SMPS_OFF; link->needed_rx_chains = sdata->local->rx_chains; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e3609176880..fb35c03af34c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6227,33 +6227,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } - if (info->attrs[NL80211_ATTR_SMPS_MODE]) { - params->smps_mode = - nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); - switch (params->smps_mode) { - case NL80211_SMPS_OFF: - break; - case NL80211_SMPS_STATIC: - if (!(rdev->wiphy.features & - NL80211_FEATURE_STATIC_SMPS)) { - err = -EINVAL; - goto out; - } - break; - case NL80211_SMPS_DYNAMIC: - if (!(rdev->wiphy.features & - NL80211_FEATURE_DYNAMIC_SMPS)) { - err = -EINVAL; - goto out; - } - break; - default: - err = -EINVAL; - goto out; - } - } else { - params->smps_mode = NL80211_SMPS_OFF; - } + if (info->attrs[NL80211_ATTR_SMPS_MODE] && + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]) != NL80211_SMPS_OFF) + return -EOPNOTSUPP; params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { -- cgit v1.3 From 9c5f2c7eeb585834f8dadb552b4fd811dd2dee6f Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 7 Oct 2024 15:00:47 +0300 Subject: wifi: mac80211: rename IEEE80211_CHANCTX_CHANGE_MIN_WIDTH The name is misleading, this actually indicates that ieee80211_chanctx_conf::min_def was updated. Rename it to IEEE80211_CHANCTX_CHANGE_MIN_DEF. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.726b5f12ae0c.I3bd9e594c9d2735183ec049a4c7224bd0a9599c9@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 4 ++-- include/net/mac80211.h | 4 ++-- net/mac80211/chan.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index a327893c6dce..11cd1a8fdd9e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -5067,7 +5067,7 @@ void iwl_mvm_change_chanctx(struct ieee80211_hw *hw, (changed & ~(IEEE80211_CHANCTX_CHANGE_WIDTH | IEEE80211_CHANCTX_CHANGE_RX_CHAINS | IEEE80211_CHANCTX_CHANGE_RADAR | - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH)), + IEEE80211_CHANCTX_CHANGE_MIN_DEF)), "Cannot change PHY. Ref=%d, changed=0x%X\n", phy_ctxt->ref, changed)) return; @@ -5075,7 +5075,7 @@ void iwl_mvm_change_chanctx(struct ieee80211_hw *hw, guard(mvm)(mvm); /* we are only changing the min_width, may be a noop */ - if (changed == IEEE80211_CHANCTX_CHANGE_MIN_WIDTH) { + if (changed == IEEE80211_CHANCTX_CHANGE_MIN_DEF) { if (phy_ctxt->width == def->width) return; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0b8df8ec5a3b..c42ad5a0c303 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -213,7 +213,7 @@ struct ieee80211_low_level_stats { * @IEEE80211_CHANCTX_CHANGE_RADAR: radar detection flag changed * @IEEE80211_CHANCTX_CHANGE_CHANNEL: switched to another operating channel, * this is used only with channel switching with CSA - * @IEEE80211_CHANCTX_CHANGE_MIN_WIDTH: The min required channel width changed + * @IEEE80211_CHANCTX_CHANGE_MIN_DEF: The min chandef changed * @IEEE80211_CHANCTX_CHANGE_AP: The AP channel definition changed, so (wider * bandwidth) OFDMA settings need to be changed * @IEEE80211_CHANCTX_CHANGE_PUNCTURING: The punctured channel(s) bitmap @@ -224,7 +224,7 @@ enum ieee80211_chanctx_change { IEEE80211_CHANCTX_CHANGE_RX_CHAINS = BIT(1), IEEE80211_CHANCTX_CHANGE_RADAR = BIT(2), IEEE80211_CHANCTX_CHANGE_CHANNEL = BIT(3), - IEEE80211_CHANCTX_CHANGE_MIN_WIDTH = BIT(4), + IEEE80211_CHANCTX_CHANGE_MIN_DEF = BIT(4), IEEE80211_CHANCTX_CHANGE_AP = BIT(5), IEEE80211_CHANCTX_CHANGE_PUNCTURING = BIT(6), }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index a155e418d26b..3f7df45b0431 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -409,7 +409,7 @@ _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, if (!ctx->driver_present) return 0; - return IEEE80211_CHANCTX_CHANGE_MIN_WIDTH; + return IEEE80211_CHANCTX_CHANGE_MIN_DEF; } static void ieee80211_chan_bw_change(struct ieee80211_local *local, -- cgit v1.3 From e21dd758cf4c51d508e6665b653c5836103d1027 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:48 +0300 Subject: wifi: mac80211: make bss_param_ch_cnt available for the low level driver Drivers may need to track this. Make it available for them, and maintain the value when beacons are received. When link X receives a beacon, iterate the RNR elements and update all the links with their respective data. Track the link id that updated the data so that each link can know whether the update came from its own beacon or from another link. In case, the update came from the link's own beacon, always update the updater link id. The purpose is to let the low level driver know if a link is losing its beacons. If link X is losing its beacons, it can still track the bss_param_ch_cnt and know where the update came from. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e2d8d1a722ad.I04b883daba2cd48e5730659eb62ca1614c899cbb@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 ++++++++ net/mac80211/ieee80211_i.h | 2 - net/mac80211/mlme.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c42ad5a0c303..20562676e9cd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -740,6 +740,19 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This + * information is the latest known value. It can come from this link's + * beacon or from a beacon sent by another link. + * @bss_param_ch_cnt_link_id: in BSS-mode, the link_id to which the beacon + * that updated &bss_param_ch_cnt belongs. E.g. if link 1 doesn't hear + * its beacons, and link 2 sent a beacon with an RNR element that updated + * link 1's BSS params change count, then, link 1's + * bss_param_ch_cnt_link_id will be 2. That means that link 1 knows that + * link 2 was the link that updated its bss_param_ch_cnt value. + * In case link 1 hears its beacon again, bss_param_ch_cnt_link_id will + * be updated to 1, even if bss_param_ch_cnt didn't change. This allows + * the link to know that it heard the latest value from its own beacon + * (as opposed to hearing its value from another link's beacon). */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; @@ -834,6 +847,8 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + u8 bss_param_ch_cnt; + u8 bss_param_ch_cnt_link_id; }; /** diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index afb867dc6b24..d884d05826d3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1015,8 +1015,6 @@ struct ieee80211_link_data_managed { int wmm_last_param_set; int mu_edca_last_param_set; - - u8 bss_param_ch_cnt; }; struct ieee80211_link_data_ap { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0303972c23e4..93a11a59339c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2643,6 +2643,89 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, &ifmgd->csa_connection_drop_work); } +struct sta_bss_param_ch_cnt_data { + struct ieee80211_sub_if_data *sdata; + u8 reporting_link_id; + u8 mld_id; +}; + +static enum cfg80211_rnr_iter_ret +ieee80211_sta_bss_param_ch_cnt_iter(void *_data, u8 type, + const struct ieee80211_neighbor_ap_info *info, + const u8 *tbtt_info, u8 tbtt_info_len) +{ + struct sta_bss_param_ch_cnt_data *data = _data; + struct ieee80211_sub_if_data *sdata = data->sdata; + const struct ieee80211_tbtt_info_ge_11 *ti; + u8 bss_param_ch_cnt; + int link_id; + + if (type != IEEE80211_TBTT_INFO_TYPE_TBTT) + return RNR_ITER_CONTINUE; + + if (tbtt_info_len < sizeof(*ti)) + return RNR_ITER_CONTINUE; + + ti = (const void *)tbtt_info; + + if (ti->mld_params.mld_id != data->mld_id) + return RNR_ITER_CONTINUE; + + link_id = le16_get_bits(ti->mld_params.params, + IEEE80211_RNR_MLD_PARAMS_LINK_ID); + bss_param_ch_cnt = + le16_get_bits(ti->mld_params.params, + IEEE80211_RNR_MLD_PARAMS_BSS_CHANGE_COUNT); + + if (bss_param_ch_cnt != 255 && + link_id < ARRAY_SIZE(sdata->link)) { + struct ieee80211_link_data *link = + sdata_dereference(sdata->link[link_id], sdata); + + if (link && link->conf->bss_param_ch_cnt != bss_param_ch_cnt) { + link->conf->bss_param_ch_cnt = bss_param_ch_cnt; + link->conf->bss_param_ch_cnt_link_id = + data->reporting_link_id; + } + } + + return RNR_ITER_CONTINUE; +} + +static void +ieee80211_mgd_update_bss_param_ch_cnt(struct ieee80211_sub_if_data *sdata, + struct ieee80211_bss_conf *bss_conf, + struct ieee802_11_elems *elems) +{ + struct sta_bss_param_ch_cnt_data data = { + .reporting_link_id = bss_conf->link_id, + .sdata = sdata, + }; + int bss_param_ch_cnt; + + if (!elems->ml_basic) + return; + + data.mld_id = ieee80211_mle_get_mld_id((const void *)elems->ml_basic); + + cfg80211_iter_rnr(elems->ie_start, elems->total_len, + ieee80211_sta_bss_param_ch_cnt_iter, &data); + + bss_param_ch_cnt = + ieee80211_mle_get_bss_param_ch_cnt((const void *)elems->ml_basic); + + /* + * Update bss_param_ch_cnt_link_id even if bss_param_ch_cnt + * didn't change to indicate that we got a beacon on our own + * link. + */ + if (bss_param_ch_cnt >= 0 && bss_param_ch_cnt != 255) { + bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt_link_id = + bss_conf->link_id; + } +} + static bool ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, @@ -4667,7 +4750,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, ret = false; goto out; } - link->u.mgd.bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt_link_id = link_id; } } else if (elems->parse_error & IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC || !elems->prof || @@ -4677,6 +4761,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, } else { const u8 *ptr = elems->prof->variable + elems->prof->sta_info_len - 1; + int bss_param_ch_cnt; /* * During parsing, we validated that these fields exist, @@ -4684,8 +4769,10 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, */ capab_info = get_unaligned_le16(ptr); assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); - link->u.mgd.bss_param_ch_cnt = + bss_param_ch_cnt = ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(elems->prof); + bss_conf->bss_param_ch_cnt = bss_param_ch_cnt; + bss_conf->bss_param_ch_cnt_link_id = link_id; if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { link_info(link, "association response status code=%u\n", @@ -6913,6 +7000,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, /* note that after this elems->ml_basic can no longer be used fully */ ieee80211_mgd_check_cross_link_csa(sdata, rx_status->link_id, elems); + ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); + if (!link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, -- cgit v1.3 From eea3323c43540f5d21668704946ea13ef0e9b574 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:49 +0300 Subject: wifi: mac80211: remove unneeded parameters ieee80211_find_80211h_pwr_constr and ieee80211_find_cisco_dtpc don't need the pointer to struct ieee80211_sub_if_data *sdata. Remove it and it'll be one step closer to handle the power constraints per-link. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.3ea505cd74e7.Id416127544afd80e4fe7b275b612aef511fc64ed@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 93a11a59339c..9b0a0091f93c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2727,8 +2727,7 @@ ieee80211_mgd_update_bss_param_ch_cnt(struct ieee80211_sub_if_data *sdata, } static bool -ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel *channel, +ieee80211_find_80211h_pwr_constr(struct ieee80211_channel *channel, const u8 *country_ie, u8 country_ie_len, const u8 *pwr_constr_elem, int *chan_pwr, int *pwr_reduction) @@ -2798,8 +2797,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, return have_chan_pwr; } -static void ieee80211_find_cisco_dtpc(struct ieee80211_sub_if_data *sdata, - struct ieee80211_channel *channel, +static void ieee80211_find_cisco_dtpc(struct ieee80211_channel *channel, const u8 *cisco_dtpc_ie, int *pwr_level) { @@ -2833,7 +2831,7 @@ static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, (capab & cpu_to_le16(WLAN_CAPABILITY_SPECTRUM_MGMT) || capab & cpu_to_le16(WLAN_CAPABILITY_RADIO_MEASURE))) { has_80211h_pwr = ieee80211_find_80211h_pwr_constr( - sdata, channel, country_ie, country_ie_len, + channel, country_ie, country_ie_len, pwr_constr_ie, &chan_pwr, &pwr_reduction_80211h); pwr_level_80211h = max_t(int, 0, chan_pwr - pwr_reduction_80211h); @@ -2841,7 +2839,7 @@ static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, if (cisco_dtpc_ie) { ieee80211_find_cisco_dtpc( - sdata, channel, cisco_dtpc_ie, &pwr_level_cisco); + channel, cisco_dtpc_ie, &pwr_level_cisco); has_cisco_pwr = true; } -- cgit v1.3 From 9925aa855d4b400346c123dcc0301289779331e3 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:50 +0300 Subject: wifi: mac80211: ieee80211_recalc_txpower receives a link Handle the tx power per-link. Don't change the behavior for now. Just change the signature of the function. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.705bbf953d0a.I8a429dede07bab5801f4c730a6abff7ce23b22d3@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 7 ++++--- net/mac80211/chan.c | 4 ++-- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 8 ++++---- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 548b9bbdac04..27468a463d8b 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3096,7 +3096,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, sdata->vif.bss_conf.txpower_type = txp_type; } - ieee80211_recalc_txpower(sdata, update_txp_type); + ieee80211_recalc_txpower(&sdata->deflink, update_txp_type); return 0; } @@ -3127,7 +3127,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) continue; - ieee80211_recalc_txpower(sdata, update_txp_type); + ieee80211_recalc_txpower(&sdata->deflink, update_txp_type); } if (has_monitor) { @@ -3139,7 +3139,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, update_txp_type = true; sdata->vif.bss_conf.txpower_type = txp_type; - ieee80211_recalc_txpower(sdata, update_txp_type); + ieee80211_recalc_txpower(&sdata->deflink, + update_txp_type); } } diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 3f7df45b0431..426ae5c066c9 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -905,7 +905,7 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { - ieee80211_recalc_txpower(sdata, false); + ieee80211_recalc_txpower(&sdata->deflink, false); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL, false); } @@ -1712,7 +1712,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) link, changed); - ieee80211_recalc_txpower(sdata, false); + ieee80211_recalc_txpower(&sdata->deflink, false); } ieee80211_recalc_chanctx_chantype(local, ctx); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d884d05826d3..94d9ffcbe1f8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2034,7 +2034,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); -void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, +void ieee80211_recalc_txpower(struct ieee80211_link_data *link, bool update_bss); void ieee80211_recalc_offload(struct ieee80211_local *local); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6ef0990d3d29..e4a8ed102736 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -74,12 +74,12 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) return false; } -void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata, +void ieee80211_recalc_txpower(struct ieee80211_link_data *link, bool update_bss) { - if (__ieee80211_recalc_txpower(sdata) || - (update_bss && ieee80211_sdata_running(sdata))) - ieee80211_link_info_change_notify(sdata, &sdata->deflink, + if (__ieee80211_recalc_txpower(link->sdata) || + (update_bss && ieee80211_sdata_running(link->sdata))) + ieee80211_link_info_change_notify(link->sdata, link, BSS_CHANGED_TXPOWER); } -- cgit v1.3 From 0b7392ee3bcff6319a6f5c5ad51710c7033d29aa Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:51 +0300 Subject: wifi: mac80211: __ieee80211_recalc_txpower receives a link Handle the tx power per-link. Don't change the behavior for now. Just change the signature of the function. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.3c9cd0731f5b.I6ebfd9d5084f3602b55c55e2669881fd92471c2f@changeid Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 5 +++-- net/mac80211/mlme.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 94d9ffcbe1f8..45987add530f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2033,7 +2033,7 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); int ieee80211_add_virtual_monitor(struct ieee80211_local *local); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); -bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata); +bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link); void ieee80211_recalc_txpower(struct ieee80211_link_data *link, bool update_bss); void ieee80211_recalc_offload(struct ieee80211_local *local); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e4a8ed102736..138ba30e23ba 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -44,8 +44,9 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work); -bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) +bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link) { + struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx_conf *chanctx_conf; int power; @@ -77,7 +78,7 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) void ieee80211_recalc_txpower(struct ieee80211_link_data *link, bool update_bss) { - if (__ieee80211_recalc_txpower(link->sdata) || + if (__ieee80211_recalc_txpower(link) || (update_bss && ieee80211_sdata_running(link->sdata))) ieee80211_link_info_change_notify(link->sdata, link, BSS_CHANGED_TXPOWER); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 9b0a0091f93c..f8fe711e1028 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2872,7 +2872,7 @@ static u64 ieee80211_handle_pwr_constr(struct ieee80211_link_data *link, } link->ap_power_level = new_ap_level; - if (__ieee80211_recalc_txpower(sdata)) + if (__ieee80211_recalc_txpower(link)) return BSS_CHANGED_TXPOWER; return 0; } -- cgit v1.3 From c4382d5ca1af75cf909463c7a707efd1a5f9a557 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 7 Oct 2024 15:00:52 +0300 Subject: wifi: mac80211: update the right link for tx power Stop looking at deflink and start using the actual link. Initialize the power settings upon link init. Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.2685dab8e1ab.I1d82cbdb2dda020aee4a225bd9a134f7d82dd810@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 94 +++++++++++++++++++++++++++++++++------------------- net/mac80211/chan.c | 4 +-- net/mac80211/iface.c | 20 +++++------ net/mac80211/link.c | 3 ++ 4 files changed, 72 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 27468a463d8b..ca4fd217be3e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3061,9 +3061,25 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, enum nl80211_tx_power_setting txp_type = type; bool update_txp_type = false; bool has_monitor = false; + int user_power_level; lockdep_assert_wiphy(local->hw.wiphy); + switch (type) { + case NL80211_TX_POWER_AUTOMATIC: + user_power_level = IEEE80211_UNSET_POWER_LEVEL; + txp_type = NL80211_TX_POWER_LIMITED; + break; + case NL80211_TX_POWER_LIMITED: + case NL80211_TX_POWER_FIXED: + if (mbm < 0 || (mbm % 100)) + return -EOPNOTSUPP; + user_power_level = MBM_TO_DBM(mbm); + break; + default: + return -EINVAL; + } + if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -3077,57 +3093,65 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, return -EOPNOTSUPP; } - switch (type) { - case NL80211_TX_POWER_AUTOMATIC: - sdata->deflink.user_power_level = - IEEE80211_UNSET_POWER_LEVEL; - txp_type = NL80211_TX_POWER_LIMITED; - break; - case NL80211_TX_POWER_LIMITED: - case NL80211_TX_POWER_FIXED: - if (mbm < 0 || (mbm % 100)) - return -EOPNOTSUPP; - sdata->deflink.user_power_level = MBM_TO_DBM(mbm); - break; - } + for (int link_id = 0; + link_id < ARRAY_SIZE(sdata->link); + link_id++) { + struct ieee80211_link_data *link = + wiphy_dereference(wiphy, sdata->link[link_id]); - if (txp_type != sdata->vif.bss_conf.txpower_type) { - update_txp_type = true; - sdata->vif.bss_conf.txpower_type = txp_type; - } + if (!link) + continue; + + link->user_power_level = user_power_level; - ieee80211_recalc_txpower(&sdata->deflink, update_txp_type); + if (txp_type != link->conf->txpower_type) { + update_txp_type = true; + link->conf->txpower_type = txp_type; + } + ieee80211_recalc_txpower(link, update_txp_type); + } return 0; } - switch (type) { - case NL80211_TX_POWER_AUTOMATIC: - local->user_power_level = IEEE80211_UNSET_POWER_LEVEL; - txp_type = NL80211_TX_POWER_LIMITED; - break; - case NL80211_TX_POWER_LIMITED: - case NL80211_TX_POWER_FIXED: - if (mbm < 0 || (mbm % 100)) - return -EOPNOTSUPP; - local->user_power_level = MBM_TO_DBM(mbm); - break; - } + local->user_power_level = user_power_level; list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { has_monitor = true; continue; } - sdata->deflink.user_power_level = local->user_power_level; - if (txp_type != sdata->vif.bss_conf.txpower_type) - update_txp_type = true; - sdata->vif.bss_conf.txpower_type = txp_type; + + for (int link_id = 0; + link_id < ARRAY_SIZE(sdata->link); + link_id++) { + struct ieee80211_link_data *link = + wiphy_dereference(wiphy, sdata->link[link_id]); + + if (!link) + continue; + + link->user_power_level = local->user_power_level; + if (txp_type != link->conf->txpower_type) + update_txp_type = true; + link->conf->txpower_type = txp_type; + } } list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) continue; - ieee80211_recalc_txpower(&sdata->deflink, update_txp_type); + + for (int link_id = 0; + link_id < ARRAY_SIZE(sdata->link); + link_id++) { + struct ieee80211_link_data *link = + wiphy_dereference(wiphy, sdata->link[link_id]); + + if (!link) + continue; + + ieee80211_recalc_txpower(link, update_txp_type); + } } if (has_monitor) { diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 426ae5c066c9..b355e9af268d 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -905,7 +905,7 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { - ieee80211_recalc_txpower(&sdata->deflink, false); + ieee80211_recalc_txpower(link, false); ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL, false); } @@ -1712,7 +1712,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) link, changed); - ieee80211_recalc_txpower(&sdata->deflink, false); + ieee80211_recalc_txpower(link, false); } ieee80211_recalc_chanctx_chantype(local, ctx); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 138ba30e23ba..7a99fa057cd9 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -46,12 +46,11 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work); bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link) { - struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_chanctx_conf *chanctx_conf; int power; rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); + chanctx_conf = rcu_dereference(link->conf->chanctx_conf); if (!chanctx_conf) { rcu_read_unlock(); return false; @@ -60,15 +59,15 @@ bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link) power = ieee80211_chandef_max_power(&chanctx_conf->def); rcu_read_unlock(); - if (sdata->deflink.user_power_level != IEEE80211_UNSET_POWER_LEVEL) - power = min(power, sdata->deflink.user_power_level); + if (link->user_power_level != IEEE80211_UNSET_POWER_LEVEL) + power = min(power, link->user_power_level); - if (sdata->deflink.ap_power_level != IEEE80211_UNSET_POWER_LEVEL) - power = min(power, sdata->deflink.ap_power_level); + if (link->ap_power_level != IEEE80211_UNSET_POWER_LEVEL) + power = min(power, link->ap_power_level); - if (power != sdata->vif.bss_conf.txpower) { - sdata->vif.bss_conf.txpower = power; - ieee80211_hw_config(sdata->local, 0); + if (power != link->conf->txpower) { + link->conf->txpower = power; + ieee80211_hw_config(link->sdata->local, 0); return true; } @@ -2177,9 +2176,6 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ieee80211_set_default_queues(sdata); - sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL; - sdata->deflink.user_power_level = local->user_power_level; - /* setup type-dependent data */ ieee80211_setup_sdata(sdata, type); diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 0bbac64d5fa0..503bdea904bc 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -36,6 +36,9 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, link->conf = link_conf; link_conf->link_id = link_id; link_conf->vif = &sdata->vif; + link->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; + link->user_power_level = sdata->local->user_power_level; + link_conf->txpower = INT_MIN; wiphy_work_init(&link->csa.finalize_work, ieee80211_csa_finalize_work); -- cgit v1.3 From f828deb70c96748eb3a7462d5dbc432a28adae5f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:53 +0300 Subject: wifi: mac80211: allow rate_control_rate_init() for links Andrei previously fixed an issue in the client where the NSS for links other than the primary/assoc/deflink isn't set. The same issue appears to exist on the AP side, because there's only a call to rate_control_rate_init() for the deflink, and not any other links. Rework the code a bit to do rate_control_rate_init() for links, even if it really doesn't work with software rate control yet, it does other things as well. Also add rate_control_rate_init_all_links() to actually do it properly when moving to ASSOC state in cfg80211. Change the explicit call to ieee80211_sta_init_nss() to instead be rate_control_rate_init() now in the client code, but also add a call to rate_control_rate_init() when a link is added in AP mode and the STA is already associated. This should fix the NSS initialization issue, and perhaps pave the way for actual software rate scaling a bit, in case anyone cares in the future, but that of course needs a lot more than just the init call. We still need to fix the rate control _update_ as well, and the sta_rc_update() driver method especially, but that will be in a different patch. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.c693274a908f.I0376da02e9f5a30eaa1b5d0d01371ff09506d453@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 11 +++++++++-- net/mac80211/ibss.c | 4 ++-- net/mac80211/mesh_plink.c | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/ocb.c | 4 ++-- net/mac80211/rate.c | 24 ++++++++++++++++++++++-- net/mac80211/rate.h | 5 +++-- 7 files changed, 40 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ca4fd217be3e..ecc138869b4b 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1720,7 +1720,7 @@ static int sta_apply_auth_flags(struct ieee80211_local *local, * before drv_sta_state() is called. */ if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - rate_control_rate_init(sta); + rate_control_rate_init_all_links(sta); ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC); if (ret) @@ -2149,7 +2149,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_ASSOC)) - rate_control_rate_init(sta); + rate_control_rate_init_all_links(sta); return sta_info_insert(sta); } @@ -5063,6 +5063,13 @@ ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev, return ret; } + if (test_sta_flag(sta, WLAN_STA_ASSOC)) { + struct link_sta_info *link_sta; + + link_sta = sdata_dereference(sta->link[params->link_id], sdata); + rate_control_rate_init(link_sta); + } + /* ieee80211_sta_activate_link frees the link upon failure */ return ieee80211_sta_activate_link(sta, params->link_id); } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 3f74bbceeca5..08fac295ad5b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -569,7 +569,7 @@ static struct sta_info *ieee80211_ibss_finish_sta(struct sta_info *sta) if (!sta->sdata->u.ibss.control_port) sta_info_pre_move_state(sta, IEEE80211_STA_AUTHORIZED); - rate_control_rate_init(sta); + rate_control_rate_init(&sta->deflink); /* If it fails, maybe we raced another insertion? */ if (sta_info_insert_rcu(sta)) @@ -1068,7 +1068,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, /* Force rx_nss recalculation */ sta->sta.deflink.rx_nss = 0; - rate_control_rate_init(sta); + rate_control_rate_init(&sta->deflink); if (sta->sta.deflink.rx_nss != rx_nss) changed |= IEEE80211_RC_NSS_CHANGED; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 42286aa3623c..be0700d64549 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -487,7 +487,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, } if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - rate_control_rate_init(sta); + rate_control_rate_init(&sta->deflink); else rate_control_rate_update(local, sband, sta, 0, changed); out: diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f8fe711e1028..714f780cc0f5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5750,7 +5750,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, /* links might have changed due to rejected ones, set them again */ ieee80211_vif_set_links(sdata, valid_links, dormant_links); - rate_control_rate_init(sta); + rate_control_rate_init_all_links(sta); if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { set_sta_flag(sta, WLAN_STA_MFP); diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index f4c51e4a1e29..6218abc3e441 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -4,7 +4,7 @@ * * Copyright: (c) 2014 Czech Technical University in Prague * (c) 2014 Volkswagen Group Research - * Copyright (C) 2022 - 2023 Intel Corporation + * Copyright (C) 2022 - 2024 Intel Corporation * Author: Rostislav Lisovy * Funded by: Volkswagen Group Research */ @@ -96,7 +96,7 @@ static struct sta_info *ieee80211_ocb_finish_sta(struct sta_info *sta) sta_info_move_state(sta, IEEE80211_STA_ASSOC); sta_info_move_state(sta, IEEE80211_STA_AUTHORIZED); - rate_control_rate_init(sta); + rate_control_rate_init(&sta->deflink); /* If it fails, maybe we raced another insertion? */ if (sta_info_insert_rcu(sta)) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3dc9752188d5..23b4f1af37e0 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -28,8 +28,9 @@ module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); -void rate_control_rate_init(struct sta_info *sta) +void rate_control_rate_init(struct link_sta_info *link_sta) { + struct sta_info *sta = link_sta->sta; struct ieee80211_local *local = sta->sdata->local; struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; @@ -37,11 +38,15 @@ void rate_control_rate_init(struct sta_info *sta) struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *chanctx_conf; - ieee80211_sta_init_nss(&sta->deflink); + ieee80211_sta_init_nss(link_sta); if (!ref) return; + /* SW rate control isn't supported with MLO right now */ + if (WARN_ON(ieee80211_vif_is_mld(&sta->sdata->vif))) + return; + rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf); @@ -67,6 +72,21 @@ void rate_control_rate_init(struct sta_info *sta) set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } +void rate_control_rate_init_all_links(struct sta_info *sta) +{ + int link_id; + + for (link_id = 0; link_id < ARRAY_SIZE(sta->link); link_id++) { + struct link_sta_info *link_sta; + + link_sta = sdata_dereference(sta->link[link_id], sta->sdata); + if (!link_sta) + continue; + + rate_control_rate_init(link_sta); + } +} + void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st) { diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index d6190f10fe7c..8d3c8903b4ae 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -3,7 +3,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022, 2024 Intel Corporation */ #ifndef IEEE80211_RATE_H @@ -29,7 +29,8 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st); -void rate_control_rate_init(struct sta_info *sta); +void rate_control_rate_init(struct link_sta_info *link_sta); +void rate_control_rate_init_all_links(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sta_info *sta, -- cgit v1.3 From 88b67e91e2928c3311f3658d1270b40708b0de00 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:54 +0300 Subject: wifi: mac80211: call rate_control_rate_update() for link STA In order to update the right link information, call the update rate_control_rate_update() with the right link_sta, and then pass that through to the driver's sta_rc_update() method. The software rate control still doesn't support it, but that'll be skipped by not having a rate control ref. Since it now operates on a link sta, rename the driver method. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.5851b6b5fd41.Ibdf50d96afa4b761dd9b9dfd54a1147e77a75329@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath10k/mac.c | 5 +++-- drivers/net/wireless/ath/ath11k/mac.c | 5 +++-- drivers/net/wireless/ath/ath12k/mac.c | 5 +++-- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 6 ++++-- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 5 +++-- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/main.c | 5 +++-- drivers/net/wireless/mediatek/mt76/mt7996/main.c | 5 +++-- drivers/net/wireless/realtek/rtw88/mac80211.c | 6 ++++-- drivers/net/wireless/realtek/rtw89/mac80211.c | 6 ++++-- drivers/net/wireless/ti/wlcore/main.c | 5 +++-- drivers/net/wireless/virtual/mac80211_hwsim.c | 8 ++++---- include/net/mac80211.h | 12 ++++++------ net/mac80211/chan.c | 2 +- net/mac80211/driver-ops.c | 15 ++++++++------- net/mac80211/driver-ops.h | 6 +++--- net/mac80211/ibss.c | 3 ++- net/mac80211/mesh_plink.c | 3 ++- net/mac80211/rate.c | 8 ++++---- net/mac80211/rate.h | 3 +-- net/mac80211/rx.c | 4 ++-- net/mac80211/tdls.c | 3 ++- net/mac80211/trace.h | 15 +++++++++------ net/mac80211/vht.c | 3 +-- 25 files changed, 80 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 646e1737d4c4..41ab83c3d3f7 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -8507,9 +8507,10 @@ exit: static void ath10k_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath10k *ar = hw->priv; struct ath10k_sta *arsta = (struct ath10k_sta *)sta->drv_priv; struct ath10k_vif *arvif = (void *)vif->drv_priv; @@ -9450,7 +9451,7 @@ static const struct ieee80211_ops ath10k_ops = { .reconfig_complete = ath10k_reconfig_complete, .get_survey = ath10k_get_survey, .set_bitrate_mask = ath10k_mac_op_set_bitrate_mask, - .sta_rc_update = ath10k_sta_rc_update, + .link_sta_rc_update = ath10k_sta_rc_update, .offset_tsf = ath10k_offset_tsf, .ampdu_action = ath10k_ampdu_action, .get_et_sset_count = ath10k_debug_get_et_sset_count, diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index f8068d2e848c..e6acbff06749 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -5079,9 +5079,10 @@ static void ath11k_mac_op_sta_set_4addr(struct ieee80211_hw *hw, static void ath11k_mac_op_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath11k *ar = hw->priv; struct ath11k_sta *arsta = ath11k_sta_to_arsta(sta); struct ath11k_vif *arvif = ath11k_vif_to_arvif(vif); @@ -9708,7 +9709,7 @@ static const struct ieee80211_ops ath11k_ops = { .sta_state = ath11k_mac_op_sta_state, .sta_set_4addr = ath11k_mac_op_sta_set_4addr, .sta_set_txpwr = ath11k_mac_op_sta_set_txpwr, - .sta_rc_update = ath11k_mac_op_sta_rc_update, + .link_sta_rc_update = ath11k_mac_op_sta_rc_update, .conf_tx = ath11k_mac_op_conf_tx, .set_antenna = ath11k_mac_op_set_antenna, .get_antenna = ath11k_mac_op_get_antenna, diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 137394c36460..acf5628adda5 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -4737,9 +4737,10 @@ out: static void ath12k_mac_op_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath12k *ar; struct ath12k_sta *arsta = ath12k_sta_to_arsta(sta); struct ath12k_vif *arvif = ath12k_vif_to_arvif(vif); @@ -8681,7 +8682,7 @@ static const struct ieee80211_ops ath12k_ops = { .set_rekey_data = ath12k_mac_op_set_rekey_data, .sta_state = ath12k_mac_op_sta_state, .sta_set_txpwr = ath12k_mac_op_sta_set_txpwr, - .sta_rc_update = ath12k_mac_op_sta_rc_update, + .link_sta_rc_update = ath12k_mac_op_sta_rc_update, .conf_tx = ath12k_mac_op_conf_tx, .set_antenna = ath12k_mac_op_set_antenna, .get_antenna = ath12k_mac_op_get_antenna, diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 8a03bcc2789e..57094bd45d98 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1357,8 +1357,10 @@ static int ath9k_htc_sta_remove(struct ieee80211_hw *hw, static void ath9k_htc_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, + u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct ath9k_htc_sta *ista = (struct ath9k_htc_sta *) sta->drv_priv; if (!(changed & IEEE80211_RC_SUPP_RATES_CHANGED)) @@ -1883,7 +1885,7 @@ struct ieee80211_ops ath9k_htc_ops = { .sta_add = ath9k_htc_sta_add, .sta_remove = ath9k_htc_sta_remove, .conf_tx = ath9k_htc_conf_tx, - .sta_rc_update = ath9k_htc_sta_rc_update, + .link_sta_rc_update = ath9k_htc_sta_rc_update, .bss_info_changed = ath9k_htc_bss_info_changed, .set_key = ath9k_htc_set_key, .get_tsf = ath9k_htc_get_tsf, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 11cd1a8fdd9e..3721d6349cc5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -4236,8 +4236,9 @@ int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value) } void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); if (changed & (IEEE80211_RC_BW_CHANGED | @@ -6562,7 +6563,7 @@ const struct ieee80211_ops iwl_mvm_hw_ops = { .allow_buffered_frames = iwl_mvm_mac_allow_buffered_frames, .release_buffered_frames = iwl_mvm_mac_release_buffered_frames, .set_rts_threshold = iwl_mvm_mac_set_rts_threshold, - .sta_rc_update = iwl_mvm_sta_rc_update, + .link_sta_rc_update = iwl_mvm_sta_rc_update, .conf_tx = iwl_mvm_mac_conf_tx, .mgd_prepare_tx = iwl_mvm_mac_mgd_prepare_tx, .mgd_complete_tx = iwl_mvm_mac_mgd_complete_tx, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c index f2378e0fb2fb..7de6c96646ca 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c @@ -1401,7 +1401,7 @@ const struct ieee80211_ops iwl_mvm_mld_hw_ops = { .allow_buffered_frames = iwl_mvm_mac_allow_buffered_frames, .release_buffered_frames = iwl_mvm_mac_release_buffered_frames, .set_rts_threshold = iwl_mvm_mac_set_rts_threshold, - .sta_rc_update = iwl_mvm_sta_rc_update, + .link_sta_rc_update = iwl_mvm_sta_rc_update, .conf_tx = iwl_mvm_mld_mac_conf_tx, .mgd_prepare_tx = iwl_mvm_mac_mgd_prepare_tx, .mgd_complete_tx = iwl_mvm_mac_mgd_complete_tx, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index ef07cff203b0..6246ffce7cf8 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -2914,7 +2914,7 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw, bool more_data); int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value); void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed); + struct ieee80211_link_sta *link_sta, u32 changed); void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_prep_tx_info *info); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c index d75e8dea1fbd..c6f498fc81ff 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c @@ -1163,9 +1163,10 @@ static void mt7915_sta_rc_work(void *data, struct ieee80211_sta *sta) static void mt7915_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct mt7915_phy *phy = mt7915_hw_phy(hw); struct mt7915_dev *dev = phy->dev; struct mt7915_sta *msta = (struct mt7915_sta *)sta->drv_priv; @@ -1709,7 +1710,7 @@ const struct ieee80211_ops mt7915_ops = { .stop_ap = mt7915_stop_ap, .sta_state = mt76_sta_state, .sta_pre_rcu_remove = mt76_sta_pre_rcu_remove, - .sta_rc_update = mt7915_sta_rc_update, + .link_sta_rc_update = mt7915_sta_rc_update, .set_key = mt7915_set_key, .ampdu_action = mt7915_ampdu_action, .set_rts_threshold = mt7915_set_rts_threshold, diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c index 39f071ece35e..2b34ae5e0cb5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c @@ -1060,9 +1060,10 @@ static void mt7996_sta_rc_work(void *data, struct ieee80211_sta *sta) static void mt7996_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct mt7996_phy *phy = mt7996_hw_phy(hw); struct mt7996_dev *dev = phy->dev; @@ -1472,7 +1473,7 @@ const struct ieee80211_ops mt7996_ops = { .sta_add = mt7996_sta_add, .sta_remove = mt7996_sta_remove, .sta_pre_rcu_remove = mt76_sta_pre_rcu_remove, - .sta_rc_update = mt7996_sta_rc_update, + .link_sta_rc_update = mt7996_sta_rc_update, .set_key = mt7996_set_key, .ampdu_action = mt7996_ampdu_action, .set_rts_threshold = mt7996_set_rts_threshold, diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index b39e90fb66b4..026fbf4ad9cc 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -928,8 +928,10 @@ static int rtw_ops_set_sar_specs(struct ieee80211_hw *hw, static void rtw_ops_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, + u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct rtw_dev *rtwdev = hw->priv; struct rtw_sta_info *si = (struct rtw_sta_info *)sta->drv_priv; @@ -973,7 +975,7 @@ const struct ieee80211_ops rtw_ops = { .reconfig_complete = rtw_reconfig_complete, .hw_scan = rtw_ops_hw_scan, .cancel_hw_scan = rtw_ops_cancel_hw_scan, - .sta_rc_update = rtw_ops_sta_rc_update, + .link_sta_rc_update = rtw_ops_sta_rc_update, .set_sar_specs = rtw_ops_set_sar_specs, #ifdef CONFIG_PM .suspend = rtw_ops_suspend, diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index 1ee63a85308f..3f33c3a2ae7d 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -1290,8 +1290,10 @@ out: static void rtw89_ops_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, u32 changed) + struct ieee80211_link_sta *link_sta, + u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct rtw89_dev *rtwdev = hw->priv; rtw89_phy_ra_update_sta(rtwdev, sta, changed); @@ -1593,7 +1595,7 @@ const struct ieee80211_ops rtw89_ops = { .remain_on_channel = rtw89_ops_remain_on_channel, .cancel_remain_on_channel = rtw89_ops_cancel_remain_on_channel, .set_sar_specs = rtw89_ops_set_sar_specs, - .sta_rc_update = rtw89_ops_sta_rc_update, + .link_sta_rc_update = rtw89_ops_sta_rc_update, .set_tid_config = rtw89_ops_set_tid_config, #ifdef CONFIG_PM .suspend = rtw89_ops_suspend, diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index 0c77b8524160..986b07bfa0ee 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -5789,9 +5789,10 @@ static int wlcore_op_cancel_remain_on_channel(struct ieee80211_hw *hw, static void wlcore_op_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { + struct ieee80211_sta *sta = link_sta->sta; struct wl12xx_vif *wlvif = wl12xx_vif_to_data(vif); wl1271_debug(DEBUG_MAC80211, "mac80211 sta_rc_update"); @@ -6052,7 +6053,7 @@ static const struct ieee80211_ops wl1271_ops = { .assign_vif_chanctx = wlcore_op_assign_vif_chanctx, .unassign_vif_chanctx = wlcore_op_unassign_vif_chanctx, .switch_vif_chanctx = wlcore_op_switch_vif_chanctx, - .sta_rc_update = wlcore_op_sta_rc_update, + .link_sta_rc_update = wlcore_op_sta_rc_update, .sta_statistics = wlcore_op_sta_statistics, .get_expected_throughput = wlcore_op_get_expected_throughput, CFG80211_TESTMODE_CMD(wl1271_tm_cmd) diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index f0e528abb1b4..fe3e2dc1626b 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2594,10 +2594,11 @@ static void mac80211_hwsim_link_info_changed(struct ieee80211_hw *hw, static void mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed) { struct mac80211_hwsim_data *data = hw->priv; + struct ieee80211_sta *sta = link_sta->sta; u32 bw = U32_MAX; int link_id; @@ -2607,7 +2608,6 @@ mac80211_hwsim_sta_rc_update(struct ieee80211_hw *hw, link_id++) { enum nl80211_chan_width confbw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_bss_conf *vif_conf; - struct ieee80211_link_sta *link_sta; link_sta = rcu_dereference(sta->link[link_id]); @@ -2659,7 +2659,7 @@ static int mac80211_hwsim_sta_add(struct ieee80211_hw *hw, hwsim_check_magic(vif); hwsim_set_sta_magic(sta); - mac80211_hwsim_sta_rc_update(hw, vif, sta, 0); + mac80211_hwsim_sta_rc_update(hw, vif, &sta->deflink, 0); if (sta->valid_links) { WARN(hweight16(sta->valid_links) > 1, @@ -3961,7 +3961,7 @@ out: .link_info_changed = mac80211_hwsim_link_info_changed, \ .tx_last_beacon = mac80211_hwsim_tx_last_beacon, \ .sta_notify = mac80211_hwsim_sta_notify, \ - .sta_rc_update = mac80211_hwsim_sta_rc_update, \ + .link_sta_rc_update = mac80211_hwsim_sta_rc_update, \ .conf_tx = mac80211_hwsim_conf_tx, \ .get_survey = mac80211_hwsim_get_survey, \ CFG80211_TESTMODE_CMD(mac80211_hwsim_testmode_cmd) \ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 20562676e9cd..7a6edc04e212 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4090,8 +4090,8 @@ struct ieee80211_prep_tx_info { * in @sta_state. * The callback can sleep. * - * @sta_rc_update: Notifies the driver of changes to the bitrates that can be - * used to transmit to the station. The changes are advertised with bits + * @link_sta_rc_update: Notifies the driver of changes to the bitrates that can + * be used to transmit to the station. The changes are advertised with bits * from &enum ieee80211_rate_control_changed and the values are reflected * in the station data. This callback should only be used when the driver * uses hardware rate control (%IEEE80211_HW_HAS_RATE_CONTROL) since @@ -4581,10 +4581,10 @@ struct ieee80211_ops { void (*sta_pre_rcu_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); - void (*sta_rc_update)(struct ieee80211_hw *hw, - struct ieee80211_vif *vif, - struct ieee80211_sta *sta, - u32 changed); + void (*link_sta_rc_update)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + u32 changed); void (*sta_rate_tbl_update)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index b355e9af268d..2d7ec557cbd6 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -467,7 +467,7 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, continue; link_sta->pub->bandwidth = new_sta_bw; - rate_control_rate_update(local, sband, sta, link_id, + rate_control_rate_update(local, sband, link_sta, IEEE80211_RC_BW_CHANGED); } } diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index fe868b521622..84d048339113 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -181,9 +181,10 @@ int drv_sta_set_txpwr(struct ieee80211_local *local, return ret; } -void drv_sta_rc_update(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u32 changed) +void drv_link_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + u32 changed) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) @@ -193,10 +194,10 @@ void drv_sta_rc_update(struct ieee80211_local *local, (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); - trace_drv_sta_rc_update(local, sdata, sta, changed); - if (local->ops->sta_rc_update) - local->ops->sta_rc_update(&local->hw, &sdata->vif, - sta, changed); + trace_drv_link_sta_rc_update(local, sdata, link_sta, changed); + if (local->ops->link_sta_rc_update) + local->ops->link_sta_rc_update(&local->hw, &sdata->vif, + link_sta, changed); trace_drv_return_void(local); } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 48bc2da728c0..edd1e4d4ad9d 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -594,9 +594,9 @@ int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta); -void drv_sta_rc_update(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u32 changed); +void drv_link_sta_rc_update(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, u32 changed); static inline void drv_sta_rate_tbl_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 08fac295ad5b..a1b4178deccf 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1072,7 +1072,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, if (sta->sta.deflink.rx_nss != rx_nss) changed |= IEEE80211_RC_NSS_CHANGED; - drv_sta_rc_update(local, sdata, &sta->sta, changed); + drv_link_sta_rc_update(local, sdata, &sta->sta.deflink, + changed); } rcu_read_unlock(); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index be0700d64549..6ea35c88dc48 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -486,10 +486,11 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, sta->sta.deflink.bandwidth = IEEE80211_STA_RX_BW_20; } + /* FIXME: this check is wrong without SW rate control */ if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) rate_control_rate_init(&sta->deflink); else - rate_control_rate_update(local, sband, sta, 0, changed); + rate_control_rate_update(local, sband, &sta->deflink, changed); out: spin_unlock_bh(&sta->mesh->plink_lock); } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 23b4f1af37e0..63fe58311a77 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -113,16 +113,15 @@ void rate_control_tx_status(struct ieee80211_local *local, void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, - struct sta_info *sta, unsigned int link_id, + struct link_sta_info *link_sta, u32 changed) { struct rate_control_ref *ref = local->rate_ctrl; + struct sta_info *sta = link_sta->sta; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_chanctx_conf *chanctx_conf; - WARN_ON(link_id != 0); - if (ref && ref->ops->rate_update) { rcu_read_lock(); @@ -140,7 +139,8 @@ void rate_control_rate_update(struct ieee80211_local *local, } if (sta->uploaded) - drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); + drv_link_sta_rc_update(local, sta->sdata, link_sta->pub, + changed); } int ieee80211_rate_control_register(const struct rate_control_ops *ops) diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 8d3c8903b4ae..673aa9efe30b 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -33,8 +33,7 @@ void rate_control_rate_init(struct link_sta_info *link_sta); void rate_control_rate_init_all_links(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, - struct sta_info *sta, - unsigned int link_id, + struct link_sta_info *link_sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 694b43091fec..d7b294221d43 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3568,7 +3568,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) sband = rx->local->hw.wiphy->bands[status->band]; - rate_control_rate_update(local, sband, rx->sta, 0, + rate_control_rate_update(local, sband, rx->link_sta, IEEE80211_RC_SMPS_CHANGED); cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, @@ -3605,7 +3605,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) ieee80211_sta_rx_bw_to_chan_width(rx->link_sta); sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; - rate_control_rate_update(local, sband, rx->sta, 0, + rate_control_rate_update(local, sband, rx->link_sta, IEEE80211_RC_BW_CHANGED); cfg80211_sta_opmode_change_notify(sdata->dev, rx->sta->addr, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f07b40916485..2f92e7c7f203 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1342,7 +1342,8 @@ static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata, bw = min(bw, ieee80211_sta_cap_rx_bw(&sta->deflink)); if (bw != sta->sta.deflink.bandwidth) { sta->sta.deflink.bandwidth = bw; - rate_control_rate_update(local, sband, sta, 0, + rate_control_rate_update(local, sband, + &sta->deflink, IEEE80211_RC_BW_CHANGED); /* * if a TDLS peer BW was updated, we need to diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index e6f0ce8e5d43..7a4985fc2b16 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -939,31 +939,34 @@ TRACE_EVENT(drv_sta_set_txpwr, ) ); -TRACE_EVENT(drv_sta_rc_update, +TRACE_EVENT(drv_link_sta_rc_update, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, + struct ieee80211_link_sta *link_sta, u32 changed), - TP_ARGS(local, sdata, sta, changed), + TP_ARGS(local, sdata, link_sta, changed), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY STA_ENTRY __field(u32, changed) + __field(u32, link_id) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - STA_ASSIGN; + STA_NAMED_ASSIGN(link_sta->sta); __entry->changed = changed; + __entry->link_id = link_sta->link_id; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " changed: 0x%x", - LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->changed + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " (link %d) changed: 0x%x", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id, + __entry->changed ) ); diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index eafe47bf201a..0d439cefb445 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -766,8 +766,7 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, if (changed > 0) { ieee80211_recalc_min_chandef(sdata, link_sta->link_id); - rate_control_rate_update(local, sband, link_sta->sta, - link_sta->link_id, changed); + rate_control_rate_update(local, sband, link_sta, changed); } } -- cgit v1.3 From cf007927972707d91fae3013ae27f165cebdf535 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 7 Oct 2024 15:00:55 +0300 Subject: wifi: mac80211: parse A-MSDU len from EHT capabilities On 2.4 GHz there's no VHT, so EHT defines its own bits for the maximum MPDU length. Parse and store them in the link_sta. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.e05da59c419a.I0b1c047639160d9a96f48ab013c18ea33f5473b0@changeid Signed-off-by: Johannes Berg --- net/mac80211/eht.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index ddc7acc68335..7a3116c36df9 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -2,7 +2,7 @@ /* * EHT handling * - * Copyright(c) 2021-2023 Intel Corporation + * Copyright(c) 2021-2024 Intel Corporation */ #include "ieee80211_i.h" @@ -75,4 +75,23 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, link_sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(link_sta); link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); + + switch (u8_get_bits(eht_cap->eht_cap_elem.mac_cap_info[0], + IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK)) { + case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454: + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991: + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895: + default: + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } + + ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } -- cgit v1.3 From 41eba07636af6f95a9421af574a5a2fa9f3888ee Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 7 Oct 2024 15:00:56 +0300 Subject: wifi: mac80211: add an option to fake ieee80211_connection_loss This allows faking this function in KUnit tests. Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.3b42e7547c65.I3bcbd51bec9ccfc7c08739450ec778722549c007@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 714f780cc0f5..480b664151c9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -31,6 +31,8 @@ #include "led.h" #include "fils_aead.h" +#include + #define IEEE80211_AUTH_TIMEOUT (HZ / 5) #define IEEE80211_AUTH_TIMEOUT_LONG (HZ / 2) #define IEEE80211_AUTH_TIMEOUT_SHORT (HZ / 10) @@ -4182,8 +4184,13 @@ EXPORT_SYMBOL(ieee80211_beacon_loss); void ieee80211_connection_loss(struct ieee80211_vif *vif) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); - struct ieee80211_hw *hw = &sdata->local->hw; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_hw *hw; + + KUNIT_STATIC_STUB_REDIRECT(ieee80211_connection_loss, vif); + + sdata = vif_to_sdata(vif); + hw = &sdata->local->hw; trace_api_connection_loss(sdata); -- cgit v1.3 From b23af47921a708e0008698eb373118bb5f20334c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:57 +0300 Subject: wifi: mac80211: chan: calculate min_def also for client mode In order to deal with (temporary) bandwidth reductions to/from the AP such as the upcoming RX OMI changes, modify the min_def calculation to also not take the chanreq width into account in client mode. This normally changes nothing as the AP bandwidth will be the same as the channel request's width. In the RX OMI changes, however, the code will reduce the bandwidth for only the AP STA, since the OMI is only to that, and TDLS STAs are unaffected. Using the min_def for this case simplifies RX OMI a lot. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.95a39c4f6f45.I2e7517fb1a7221dc6f60b0c752e4882042b4265d@changeid Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 2d7ec557cbd6..0e44f4e38099 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -323,18 +323,26 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, continue; switch (link->sdata->vif.type) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - width = ieee80211_get_max_required_bw(link); - break; case NL80211_IFTYPE_STATION: + if (!link->sdata->vif.cfg.assoc) { + /* + * The AP's sta->bandwidth may not yet be set + * at this point (pre-association), so simply + * take the width from the chandef. We cannot + * have TDLS peers yet (only after association). + */ + width = link->conf->chanreq.oper.width; + break; + } /* - * The ap's sta->bandwidth is not set yet at this - * point, so take the width from the chandef, but - * account also for TDLS peers + * otherwise just use min_def like in AP, depending on what + * we currently think the AP STA (and possibly TDLS peers) + * require(s) */ - width = max(link->conf->chanreq.oper.width, - ieee80211_get_max_required_bw(link)); + fallthrough; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + width = ieee80211_get_max_required_bw(link); break; case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: -- cgit v1.3 From 751e7489c1d74b94ffffbed619d8fd724eeff4ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 7 Oct 2024 15:00:58 +0300 Subject: wifi: mac80211: expose ieee80211_chan_width_to_rx_bw() to drivers Drivers might need to also do this calculation, no point in them duplicating the code. Since it's so simple, just make it an inline. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20241007144851.af003cb4a088.I8b5d29504b726caae24af6013c65b3daebe842a2@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 27 +++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 2 -- net/mac80211/vht.c | 22 ---------------------- 3 files changed, 27 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7a6edc04e212..32bdda90a4ac 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7704,6 +7704,33 @@ void ieee80211_set_active_links_async(struct ieee80211_vif *vif, */ void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif); +/** + * ieee80211_chan_width_to_rx_bw - convert channel width to STA RX bandwidth + * @width: the channel width value to convert + * Return: the STA RX bandwidth value for the channel width + */ +static inline enum ieee80211_sta_rx_bandwidth +ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) +{ + switch (width) { + default: + WARN_ON_ONCE(1); + fallthrough; + case NL80211_CHAN_WIDTH_20_NOHT: + case NL80211_CHAN_WIDTH_20: + return IEEE80211_STA_RX_BW_20; + case NL80211_CHAN_WIDTH_40: + return IEEE80211_STA_RX_BW_40; + case NL80211_CHAN_WIDTH_80: + return IEEE80211_STA_RX_BW_80; + case NL80211_CHAN_WIDTH_160: + case NL80211_CHAN_WIDTH_80P80: + return IEEE80211_STA_RX_BW_160; + case NL80211_CHAN_WIDTH_320: + return IEEE80211_STA_RX_BW_320; + } +} + /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 45987add530f..d20c2e796703 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2194,8 +2194,6 @@ ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta) return _ieee80211_sta_cur_vht_bw(link_sta, NULL); } void ieee80211_sta_init_nss(struct link_sta_info *link_sta); -enum ieee80211_sta_rx_bandwidth -ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width); enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta); void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 0d439cefb445..6a20fa099190 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -479,28 +479,6 @@ ieee80211_sta_rx_bw_to_chan_width(struct link_sta_info *link_sta) } } -enum ieee80211_sta_rx_bandwidth -ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_20_NOHT: - case NL80211_CHAN_WIDTH_20: - return IEEE80211_STA_RX_BW_20; - case NL80211_CHAN_WIDTH_40: - return IEEE80211_STA_RX_BW_40; - case NL80211_CHAN_WIDTH_80: - return IEEE80211_STA_RX_BW_80; - case NL80211_CHAN_WIDTH_160: - case NL80211_CHAN_WIDTH_80P80: - return IEEE80211_STA_RX_BW_160; - case NL80211_CHAN_WIDTH_320: - return IEEE80211_STA_RX_BW_320; - default: - WARN_ON_ONCE(1); - return IEEE80211_STA_RX_BW_20; - } -} - /* FIXME: rename/move - this deals with everything not just VHT */ enum ieee80211_sta_rx_bandwidth _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, -- cgit v1.3 From 3607798ad9bdef35ad08489a8239390fccaac6b5 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:42 +0200 Subject: wifi: cfg80211: add option for vif allowed radios This allows users to prevent a vif from affecting radios other than the configured ones. This can be useful in cases where e.g. an AP is running on one radio, and triggering a scan on another radio should not disturb it. Changing the allowed radios list for a vif is supported, but only while it is down. While it is possible to achieve the same by always explicitly specifying a frequency list for scan requests and ensuring that the wrong channel/band is never accidentally set on an unrelated interface, this change makes multi-radio wiphy setups a lot easier to deal with for CLI users. By itself, this patch only enforces the radio mask for scanning requests and remain-on-channel. Follow-up changes build on this to limit configured frequencies. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/eefcb218780f71a1549875d149f1196486762756.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 +++++++++++ include/uapi/linux/nl80211.h | 5 ++++ net/wireless/core.c | 2 ++ net/wireless/nl80211.c | 60 ++++++++++++++++++++++++++++++++++++++------ net/wireless/scan.c | 10 +++++--- net/wireless/util.c | 29 +++++++++++++++++++++ 6 files changed, 109 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c8ce5c2e14f4..95d05e67e69a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6221,6 +6221,7 @@ enum ieee80211_ap_reg_power { * entered. * @links.cac_time_ms: CAC time in ms * @valid_links: bitmap describing what elements of @links are valid + * @radio_mask: Bitmask of radios that this interface is allowed to operate on. */ struct wireless_dev { struct wiphy *wiphy; @@ -6333,6 +6334,8 @@ struct wireless_dev { unsigned int cac_time_ms; } links[IEEE80211_MLD_MAX_NUM_LINKS]; u16 valid_links; + + u32 radio_mask; }; static inline const u8 *wdev_address(struct wireless_dev *wdev) @@ -6518,6 +6521,17 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef); +/** + * cfg80211_wdev_channel_allowed - Check if the wdev may use the channel + * + * @wdev: the wireless device + * @chan: channel to check + * + * Return: whether or not the wdev may use the channel + */ +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan); + /** * ieee80211_get_response_rate - get basic rate for a given rate * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f97f5adc8d51..d31ccee99cc7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2868,6 +2868,9 @@ enum nl80211_commands { * nested item, it contains attributes defined in * &enum nl80211_if_combination_attrs. * + * @NL80211_ATTR_VIF_RADIO_MASK: Bitmask of allowed radios (u32). + * A value of 0 means all radios. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3416,6 +3419,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_RADIOS, NL80211_ATTR_WIPHY_INTERFACE_COMBINATIONS, + NL80211_ATTR_VIF_RADIO_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index 4c8d8f167409..93d62a1d3a45 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1430,6 +1430,8 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) /* allow mac80211 to determine the timeout */ wdev->ps_timeout = -1; + wdev->radio_mask = BIT(wdev->wiphy->n_radio) - 1; + if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fb35c03af34c..a330347dd7a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -829,6 +829,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_TTLM_DLINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8), [NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG }, + [NL80211_ATTR_VIF_RADIO_MASK] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -3996,7 +3997,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ (cfg80211_rdev_list_generation << 2)) || - nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr)) + nla_put_u8(msg, NL80211_ATTR_4ADDR, wdev->use_4addr) || + nla_put_u32(msg, NL80211_ATTR_VIF_RADIO_MASK, wdev->radio_mask)) goto nla_put_failure; if (rdev->ops->get_channel && !wdev->valid_links) { @@ -4312,6 +4314,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, return -EOPNOTSUPP; } +static int nl80211_parse_vif_radio_mask(struct genl_info *info, + u32 *radio_mask) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct nlattr *attr = info->attrs[NL80211_ATTR_VIF_RADIO_MASK]; + u32 mask, allowed; + + if (!attr) { + *radio_mask = 0; + return 0; + } + + allowed = BIT(rdev->wiphy.n_radio) - 1; + mask = nla_get_u32(attr); + if (mask & ~allowed) + return -EINVAL; + if (!mask) + mask = allowed; + *radio_mask = mask; + + return 1; +} + static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -4319,6 +4344,8 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) int err; enum nl80211_iftype otype, ntype; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u32 radio_mask = 0; bool change = false; memset(¶ms, 0, sizeof(params)); @@ -4332,8 +4359,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_MESH_ID]) { - struct wireless_dev *wdev = dev->ieee80211_ptr; - if (ntype != NL80211_IFTYPE_MESH_POINT) return -EINVAL; if (otype != NL80211_IFTYPE_MESH_POINT) @@ -4364,6 +4389,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (err > 0) change = true; + err = nl80211_parse_vif_radio_mask(info, &radio_mask); + if (err < 0) + return err; + if (err && netif_running(dev)) + return -EBUSY; + if (change) err = cfg80211_change_iface(rdev, dev, ntype, ¶ms); else @@ -4372,11 +4403,11 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (!err && params.use_4addr != -1) dev->ieee80211_ptr->use_4addr = params.use_4addr; - if (change && !err) { - struct wireless_dev *wdev = dev->ieee80211_ptr; + if (radio_mask) + wdev->radio_mask = radio_mask; + if (change && !err) nl80211_notify_iface(rdev, wdev, NL80211_CMD_SET_INTERFACE); - } return err; } @@ -4387,6 +4418,7 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) struct vif_params params; struct wireless_dev *wdev; struct sk_buff *msg; + u32 radio_mask; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; @@ -4424,6 +4456,10 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) if (err < 0) return err; + err = nl80211_parse_vif_radio_mask(info, &radio_mask); + if (err < 0) + return err; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -4465,6 +4501,9 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) break; } + if (radio_mask) + wdev->radio_mask = radio_mask; + if (nl80211_send_iface(msg, info->snd_portid, info->snd_seq, 0, rdev, wdev, NL80211_CMD_NEW_INTERFACE) < 0) { nlmsg_free(msg); @@ -9156,6 +9195,9 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev, lockdep_assert_wiphy(wdev->wiphy); + if (!cfg80211_wdev_channel_allowed(wdev, chan)) + return false; + if (!cfg80211_beaconing_iface_active(wdev)) return true; @@ -9368,7 +9410,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } /* ignore disabled channels */ - if (chan->flags & IEEE80211_CHAN_DISABLED) + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(wdev, chan)) continue; request->channels[i] = chan; @@ -9388,7 +9431,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; - if (chan->flags & IEEE80211_CHAN_DISABLED) + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(wdev, chan)) continue; request->channels[i] = chan; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 8ba618f4734f..8e3d46bf4836 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -956,7 +956,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); - if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) continue; for (i = 0; i < rdev_req->n_channels; i++) { @@ -3515,9 +3516,12 @@ int cfg80211_wext_siwscan(struct net_device *dev, continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { + struct ieee80211_channel *chan; + /* ignore disabled channels */ - if (wiphy->bands[band]->channels[j].flags & - IEEE80211_CHAN_DISABLED) + chan = &wiphy->bands[band]->channels[j]; + if (chan->flags & IEEE80211_CHAN_DISABLED || + !cfg80211_wdev_channel_allowed(creq->wdev, chan)) continue; /* If we have a wireless request structure and the diff --git a/net/wireless/util.c b/net/wireless/util.c index 93a9c32418a6..040d62051eb9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2923,3 +2923,32 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, return true; } EXPORT_SYMBOL(cfg80211_radio_chandef_valid); + +bool cfg80211_wdev_channel_allowed(struct wireless_dev *wdev, + struct ieee80211_channel *chan) +{ + struct wiphy *wiphy = wdev->wiphy; + const struct wiphy_radio *radio; + struct cfg80211_chan_def chandef; + u32 radio_mask; + int i; + + radio_mask = wdev->radio_mask; + if (!wiphy->n_radio || radio_mask == BIT(wiphy->n_radio) - 1) + return true; + + cfg80211_chandef_create(&chandef, chan, NL80211_CHAN_HT20); + for (i = 0; i < wiphy->n_radio; i++) { + if (!(radio_mask & BIT(i))) + continue; + + radio = &wiphy->radio[i]; + if (!cfg80211_radio_chandef_valid(radio, &chandef)) + continue; + + return true; + } + + return false; +} +EXPORT_SYMBOL(cfg80211_wdev_channel_allowed); -- cgit v1.3 From 32ee616a7f8c36fa3ab00985ebd038c3487e721f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:43 +0200 Subject: wifi: mac80211: use vif radio mask to limit ibss scan frequencies Reject frequencies not supported by any radio that the vif is allowed to use. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/9d5c0b6b00a7ecef6a0ac6de765c0af00c8bb0e1.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index adb88c06b598..cb7079071885 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -1176,14 +1176,14 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; - int ret = -EBUSY, i, n_ch = 0; + int i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) - goto unlock; + return -EBUSY; /* fill internal scan request */ if (!channels) { @@ -1200,7 +1200,9 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | - IEEE80211_CHAN_DISABLED)) + IEEE80211_CHAN_DISABLED) || + !cfg80211_wdev_channel_allowed(&sdata->wdev, + tmp_ch)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; @@ -1209,21 +1211,23 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, } if (WARN_ON_ONCE(n_ch == 0)) - goto unlock; + return -EINVAL; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | - IEEE80211_CHAN_DISABLED)) + IEEE80211_CHAN_DISABLED) || + !cfg80211_wdev_channel_allowed(&sdata->wdev, + channels[i])) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } - if (WARN_ON_ONCE(n_ch == 0)) - goto unlock; + if (n_ch == 0) + return -EINVAL; local->int_scan_req->n_channels = n_ch; } @@ -1233,9 +1237,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; - ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); - unlock: - return ret; + return __ieee80211_start_scan(sdata, sdata->local->int_scan_req); } void ieee80211_scan_cancel(struct ieee80211_local *local) -- cgit v1.3 From 7b68f63d5c00105f8b1f28017369f1da6dfe704c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:44 +0200 Subject: wifi: mac80211: use vif radio mask to limit creating chanctx Reject frequencies not supported by any radio that the vif is allowed to use. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/95ea1f6fc5bd1614a0c7952b6c67726e3fd635fb.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 0e44f4e38099..996965005d49 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1177,7 +1177,7 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, static bool ieee80211_find_available_radio(struct ieee80211_local *local, const struct ieee80211_chan_req *chanreq, - int *radio_idx) + u32 radio_mask, int *radio_idx) { struct wiphy *wiphy = local->hw.wiphy; const struct wiphy_radio *radio; @@ -1188,6 +1188,9 @@ ieee80211_find_available_radio(struct ieee80211_local *local, return true; for (i = 0; i < wiphy->n_radio; i++) { + if (!(radio_mask & BIT(i))) + continue; + radio = &wiphy->radio[i]; if (!cfg80211_radio_chandef_valid(radio, &chanreq->oper)) continue; @@ -1221,7 +1224,9 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, new_ctx = ieee80211_find_reservation_chanctx(local, chanreq, mode); if (!new_ctx) { if (ieee80211_can_create_new_chanctx(local, -1) && - ieee80211_find_available_radio(local, chanreq, &radio_idx)) + ieee80211_find_available_radio(local, chanreq, + sdata->wdev.radio_mask, + &radio_idx)) new_ctx = ieee80211_new_chanctx(local, chanreq, mode, false, radio_idx); else @@ -1891,7 +1896,9 @@ int _ieee80211_link_use_channel(struct ieee80211_link_data *link, /* Note: context is now reserved */ if (ctx) reserved = true; - else if (!ieee80211_find_available_radio(local, chanreq, &radio_idx)) + else if (!ieee80211_find_available_radio(local, chanreq, + sdata->wdev.radio_mask, + &radio_idx)) ctx = ERR_PTR(-EBUSY); else ctx = ieee80211_new_chanctx(local, chanreq, mode, -- cgit v1.3 From ebda716ea4da03326ac4d0a71604d18aa8a2e695 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:45 +0200 Subject: wifi: cfg80211: report per wiphy radio antenna mask With multi-radio devices, each radio typically gets a fixed set of antennas. In order to be able to disable specific antennas for some radios, user space needs to know which antenna mask bits are assigned to which radio. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e0a26afa2c88eaa188ec96ec6d17ecac4e827641.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 3 +++ net/wireless/nl80211.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 95d05e67e69a..3100733f3e23 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5434,6 +5434,8 @@ struct wiphy_radio_freq_range { * @iface_combinations: Valid interface combinations array, should not * list single interface types. * @n_iface_combinations: number of entries in @iface_combinations array. + * + * @antenna_mask: bitmask of antennas connected to this radio. */ struct wiphy_radio { const struct wiphy_radio_freq_range *freq_range; @@ -5441,6 +5443,8 @@ struct wiphy_radio { const struct ieee80211_iface_combination *iface_combinations; int n_iface_combinations; + + u32 antenna_mask; }; #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d31ccee99cc7..1b8827f920ff 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8036,6 +8036,8 @@ enum nl80211_ap_settings_flags { * @NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION: Supported interface * combination for this radio. Attribute may be present multiple times * and contains attributes defined in &enum nl80211_if_combination_attrs. + * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas + * connected to this radio. * * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute @@ -8046,6 +8048,7 @@ enum nl80211_wiphy_radio_attrs { NL80211_WIPHY_RADIO_ATTR_INDEX, NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE, NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION, + NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, /* keep last */ __NL80211_WIPHY_RADIO_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a330347dd7a3..aa78f18dd454 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2431,6 +2431,11 @@ static int nl80211_put_radio(struct wiphy *wiphy, struct sk_buff *msg, int idx) if (nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_INDEX, idx)) goto nla_put_failure; + if (r->antenna_mask && + nla_put_u32(msg, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, + r->antenna_mask)) + goto nla_put_failure; + for (i = 0; i < r->n_freq_range; i++) { const struct wiphy_radio_freq_range *range = &r->freq_range[i]; -- cgit v1.3 From 006a97ceb6732c861c0e2fa3b6a34512caac9354 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:46 +0200 Subject: wifi: mac80211: remove status->ampdu_delimiter_crc This was never used by any driver, so remove it to free up some space. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/e6fee6eed49b105261830db1c74f13841fb9616c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +----- net/mac80211/rx.c | 7 +------ 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 32bdda90a4ac..d4f23224eff9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1463,8 +1463,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU * @RX_FLAG_AMPDU_DELIM_CRC_ERROR: A delimiter CRC error has been detected * on this subframe - * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC - * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without @@ -1536,7 +1534,7 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), - RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), + /* one free bit at 15 */ RX_FLAG_MACTIME = BIT(16) | BIT(17), RX_FLAG_MACTIME_PLCP_START = 1 << 16, RX_FLAG_MACTIME_START = 2 << 16, @@ -1633,7 +1631,6 @@ enum mac80211_rx_encoding { * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU - * @ampdu_delimiter_crc: A-MPDU delimiter CRC * @zero_length_psdu_type: radiotap type of the 0-length PSDU * @link_valid: if the link which is identified by @link_id is valid. This flag * is set only when connection is MLO. @@ -1671,7 +1668,6 @@ struct ieee80211_rx_status { s8 signal; u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; - u8 ampdu_delimiter_crc; u8 zero_length_psdu_type; u8 link_valid:1, link_id:4; }; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d7b294221d43..a34523bbd156 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -508,18 +508,13 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, flags |= IEEE80211_RADIOTAP_AMPDU_IS_LAST; if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_ERROR) flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR; - if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) - flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN; if (status->flag & RX_FLAG_AMPDU_EOF_BIT_KNOWN) flags |= IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN; if (status->flag & RX_FLAG_AMPDU_EOF_BIT) flags |= IEEE80211_RADIOTAP_AMPDU_EOF; put_unaligned_le16(flags, pos); pos += 2; - if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) - *pos++ = status->ampdu_delimiter_crc; - else - *pos++ = 0; + *pos++ = 0; *pos++ = 0; } -- cgit v1.3 From 9c4f830927750a2bf9fd9426a5257f0fdce3b662 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:47 +0200 Subject: wifi: cfg80211: pass net_device to .set_monitor_channel Preparation for allowing multiple monitor interfaces with different channels on a multi-radio wiphy. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/35fa652dbfebf93343f8b9a08fdef0467a2a02dc.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/cfg80211.c | 1 + drivers/net/wireless/marvell/libertas/cfg.c | 1 + drivers/net/wireless/microchip/wilc1000/cfg80211.c | 3 ++- include/net/cfg80211.h | 1 + net/mac80211/cfg.c | 1 + net/wireless/chan.c | 3 ++- net/wireless/core.h | 1 + net/wireless/nl80211.c | 2 +- net/wireless/rdev-ops.h | 5 +++-- net/wireless/trace.h | 10 ++++++---- net/wireless/wext-compat.c | 2 +- 11 files changed, 20 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index e8f1d30a8d73..a1a0a9223e74 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1493,6 +1493,7 @@ out: } static int wil_cfg80211_set_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct wil6210_priv *wil = wiphy_to_wil(wiphy); diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c index afe9bcd3ad46..2e2c193716d9 100644 --- a/drivers/net/wireless/marvell/libertas/cfg.c +++ b/drivers/net/wireless/marvell/libertas/cfg.c @@ -486,6 +486,7 @@ static int lbs_add_wps_enrollee_tlv(u8 *tlv, const u8 *ie, size_t ie_len) */ static int lbs_cfg_set_monitor_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct lbs_private *priv = wiphy_priv(wiphy); diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index b0dae6f7c633..e96736cc7259 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -231,6 +231,7 @@ struct wilc_vif *wilc_get_wl_to_vif(struct wilc *wl) } static int set_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct wilc *wl = wiphy_priv(wiphy); @@ -1424,7 +1425,7 @@ static int start_ap(struct wiphy *wiphy, struct net_device *dev, struct wilc_vif *vif = netdev_priv(dev); int ret; - ret = set_channel(wiphy, &settings->chandef); + ret = set_channel(wiphy, dev, &settings->chandef); if (ret != 0) netdev_err(dev, "Error in setting channel\n"); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3100733f3e23..5feb93ba0400 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4694,6 +4694,7 @@ struct cfg80211_ops { struct ieee80211_channel *chan); int (*set_monitor_channel)(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef); int (*scan)(struct wiphy *wiphy, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ecc138869b4b..492349d6f7bb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -897,6 +897,7 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_set_monitor_channel(struct wiphy *wiphy, + struct net_device *dev, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = wiphy_priv(wiphy); diff --git a/net/wireless/chan.c b/net/wireless/chan.c index afd86f7c66ce..40b6375a5de4 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -1628,6 +1628,7 @@ bool cfg80211_reg_check_beaconing(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_reg_check_beaconing); int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef) { if (!rdev->ops->set_monitor_channel) @@ -1635,7 +1636,7 @@ int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, if (!cfg80211_has_monitors_only(rdev)) return -EBUSY; - return rdev_set_monitor_channel(rdev, chandef); + return rdev_set_monitor_channel(rdev, dev, chandef); } bool cfg80211_any_usable_channels(struct wiphy *wiphy, diff --git a/net/wireless/core.h b/net/wireless/core.h index 3b3e3cd7027a..4c45f994a8c0 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -516,6 +516,7 @@ static inline unsigned int elapsed_jiffies_msecs(unsigned long start) } int cfg80211_set_monitor_channel(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef); int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index aa78f18dd454..84015f56e93a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3567,7 +3567,7 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_MESH_POINT: return cfg80211_set_mesh_channel(rdev, wdev, &chandef); case NL80211_IFTYPE_MONITOR: - return cfg80211_set_monitor_channel(rdev, &chandef); + return cfg80211_set_monitor_channel(rdev, dev, &chandef); default: break; } diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index f5adbf6b5c84..adb6105bbb7d 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -445,11 +445,12 @@ rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; - trace_rdev_set_monitor_channel(&rdev->wiphy, chandef); - ret = rdev->ops->set_monitor_channel(&rdev->wiphy, chandef); + trace_rdev_set_monitor_channel(&rdev->wiphy, dev, chandef); + ret = rdev->ops->set_monitor_channel(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 97c21b627791..d5c9bb614fa6 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1318,19 +1318,21 @@ TRACE_EVENT(rdev_libertas_set_mesh_channel, ); TRACE_EVENT(rdev_set_monitor_channel, - TP_PROTO(struct wiphy *wiphy, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), + TP_ARGS(wiphy, netdev, chandef), TP_STRUCT__entry( WIPHY_ENTRY + NETDEV_ENTRY CHAN_DEF_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; + NETDEV_ASSIGN; CHAN_DEF_ASSIGN(chandef); ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); TRACE_EVENT(rdev_auth, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 0c8d3797a02e..90d5c0592667 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -821,7 +821,7 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, ret = -EINVAL; break; } - ret = cfg80211_set_monitor_channel(rdev, &chandef); + ret = cfg80211_set_monitor_channel(rdev, dev, &chandef); break; case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); -- cgit v1.3 From 9d40f7e32774279be7e5a7a278d7a290872b2f81 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:48 +0200 Subject: wifi: mac80211: add flag to opt out of virtual monitor support This is useful for multi-radio devices that are capable of monitoring on multiple channels simultanenously. When this flag is set, each monitor interface is passed to the driver individually and can have a configured channel. The vif mac address for non-active monitor interfaces is cleared, in order to allow the driver to tell them apart from active ones. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/3c55505ee0cf0a5f141fbcb30d1e8be8d9f40373.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ net/mac80211/cfg.c | 44 ++++++++++++++++++++++++++++---------------- net/mac80211/chan.c | 14 +++++++++++++- net/mac80211/debugfs.c | 1 + net/mac80211/driver-ops.c | 1 + net/mac80211/iface.c | 22 +++++++++++++++++----- net/mac80211/rx.c | 3 +++ net/mac80211/tx.c | 6 ++++-- net/mac80211/util.c | 14 ++++++++++---- 9 files changed, 83 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d4f23224eff9..b4f246cdcca4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2694,6 +2694,11 @@ struct ieee80211_txq { * a virtual monitor interface when monitor interfaces are the only * active interfaces. * + * @IEEE80211_HW_NO_VIRTUAL_MONITOR: The driver would like to be informed + * of any monitor interface, as well as their configured channel. + * This is useful for supporting multiple monitor interfaces on different + * channels. + * * @IEEE80211_HW_NO_AUTO_VIF: The driver would like for no wlanX to * be created. It is expected user-space will create vifs as * desired (and thus have them named as desired). @@ -2853,6 +2858,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_DYNAMIC_PS, IEEE80211_HW_MFP_CAPABLE, IEEE80211_HW_WANT_MONITOR_VIF, + IEEE80211_HW_NO_VIRTUAL_MONITOR, IEEE80211_HW_NO_AUTO_VIF, IEEE80211_HW_SW_CRYPTO_CONTROL, IEEE80211_HW_SUPPORT_FAST_XMIT, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 492349d6f7bb..6c0b228523cb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -105,8 +105,11 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, } /* also validate MU-MIMO change */ - monitor_sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + monitor_sdata = sdata; + else + monitor_sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); if (!monitor_sdata && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) @@ -114,7 +117,9 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* apply all changes now - no failures allowed */ - if (monitor_sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) + if (monitor_sdata && + (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))) ieee80211_set_mu_mimo_follow(monitor_sdata, params); if (params->flags) { @@ -907,22 +912,25 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, lockdep_assert_wiphy(local->hw.wiphy); - if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, - &chanreq.oper)) - return 0; + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { + if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, + &chanreq.oper)) + return 0; - sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); - if (!sdata) - goto done; + sdata = wiphy_dereference(wiphy, local->monitor_sdata); + if (!sdata) + goto done; + } - if (cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, + if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) && + cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, &chanreq.oper)) return 0; ieee80211_link_release_channel(&sdata->deflink); ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq, - IEEE80211_CHANCTX_EXCLUSIVE); + IEEE80211_CHANCTX_SHARED); if (ret) return ret; done: @@ -3084,7 +3092,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return -EOPNOTSUPP; @@ -3118,7 +3127,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, local->user_power_level = user_power_level; list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { has_monitor = true; continue; } @@ -3139,7 +3149,8 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, } } list_for_each_entry(sdata, &local->interfaces, list) { - if (sdata->vif.type == NL80211_IFTYPE_MONITOR) + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; for (int link_id = 0; @@ -4342,7 +4353,8 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy, if (chanctx_conf) { *chandef = link->conf->chanreq.oper; ret = 0; - } else if (local->open_count > 0 && + } else if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && + local->open_count > 0 && local->open_count == local->monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { *chandef = local->monitor_chanreq.oper; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 996965005d49..a442cb667520 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -347,6 +347,10 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_NAN: continue; + case NL80211_IFTYPE_MONITOR: + WARN_ON_ONCE(!ieee80211_hw_check(&local->hw, + NO_VIRTUAL_MONITOR)); + fallthrough; case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: @@ -355,7 +359,6 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: - case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: WARN_ON_ONCE(1); @@ -964,6 +967,10 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, if (!link->sdata->u.mgd.associated) continue; break; + case NL80211_IFTYPE_MONITOR: + if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; + break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: @@ -976,6 +983,11 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) continue; + if (link->sdata->vif.type == NL80211_IFTYPE_MONITOR) { + rx_chains_dynamic = rx_chains_static = local->rx_chains; + break; + } + switch (link->smps_mode) { default: WARN_ONCE(1, "Invalid SMPS mode %d\n", diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 02b5476a4376..b777240c924e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -456,6 +456,7 @@ static const char *hw_flag_names[] = { FLAG(SUPPORTS_DYNAMIC_PS), FLAG(MFP_CAPABLE), FLAG(WANT_MONITOR_VIF), + FLAG(NO_VIRTUAL_MONITOR), FLAG(NO_AUTO_VIF), FLAG(SW_CRYPTO_CONTROL), FLAG(SUPPORT_FAST_XMIT), diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 84d048339113..299d38e9e863 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -65,6 +65,7 @@ int drv_add_interface(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7a99fa057cd9..683cc50cc266 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -279,8 +279,13 @@ static int _ieee80211_change_mac(struct ieee80211_sub_if_data *sdata, ret = eth_mac_addr(sdata->dev, sa); if (ret == 0) { - memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); - ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); + if (check_dup) { + memcpy(sdata->vif.addr, sa->sa_data, ETH_ALEN); + ether_addr_copy(sdata->vif.bss_conf.addr, sdata->vif.addr); + } else { + memset(sdata->vif.addr, 0, ETH_ALEN); + memset(sdata->vif.bss_conf.addr, 0, ETH_ALEN); + } } /* Regardless of eth_mac_addr() return we still want to add the @@ -699,9 +704,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_recalc_idle(local); ieee80211_recalc_offload(local); - if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) break; + ieee80211_link_release_channel(&sdata->deflink); fallthrough; default: if (!going_down) @@ -1131,7 +1138,8 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); - if (local->monitor_sdata) + if (local->monitor_sdata || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) return 0; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); @@ -1193,6 +1201,9 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + return; + ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); @@ -1328,7 +1339,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) break; } - if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { res = drv_add_interface(local, sdata); if (res) goto err_stop; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a34523bbd156..d032bfb00ade 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -840,6 +840,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, bool last_monitor = list_is_last(&sdata->u.mntr.list, &local->mon_list); + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + ieee80211_handle_mu_mimo_mon(sdata, origskb, rtap_space); + if (!monskb) monskb = ieee80211_make_monitor_skb(local, &origskb, rate, rtap_space, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index f10379bef9fb..a24636bda679 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1763,7 +1763,8 @@ static bool __ieee80211_tx(struct ieee80211_local *local, switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &sdata->vif; break; } @@ -3952,7 +3953,8 @@ begin: switch (tx.sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { + if ((tx.sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { vif = &tx.sdata->vif; break; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bd93de637f94..a4e1301cc999 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -756,7 +756,8 @@ static void __iterate_interfaces(struct ieee80211_local *local, lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) continue; break; case NL80211_IFTYPE_AP_VLAN: @@ -1873,8 +1874,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) } list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - sdata->vif.type != NL80211_IFTYPE_MONITOR && ieee80211_sdata_running(sdata)) { res = drv_add_interface(local, sdata); if (WARN_ON(res)) @@ -1887,11 +1890,14 @@ int ieee80211_reconfig(struct ieee80211_local *local) */ if (res) { list_for_each_entry_continue_reverse(sdata, &local->interfaces, - list) + list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN && - sdata->vif.type != NL80211_IFTYPE_MONITOR && ieee80211_sdata_running(sdata)) drv_remove_interface(local, sdata); + } ieee80211_handle_reconfig_failure(local); return res; } -- cgit v1.3 From a77e527b470cc38754c730bce1483711f643bb60 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:49 +0200 Subject: wifi: cfg80211: add monitor SKIP_TX flag This can be used to indicate that the user is not interested in receiving locally sent packets on the monitor interface. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/f0c20f832eadd36c71fba9a2a16ba57d78389b6c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 1 + 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5feb93ba0400..8f9853b1a5d1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2267,6 +2267,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering * @MONITOR_FLAG_COOK_FRAMES: report frames after processing * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address + * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ enum monitor_flags { MONITOR_FLAG_CHANGED = BIT(__NL80211_MNTR_FLAG_INVALID), @@ -2276,6 +2277,7 @@ enum monitor_flags { MONITOR_FLAG_OTHER_BSS = BIT(NL80211_MNTR_FLAG_OTHER_BSS), MONITOR_FLAG_COOK_FRAMES = BIT(NL80211_MNTR_FLAG_COOK_FRAMES), MONITOR_FLAG_ACTIVE = BIT(NL80211_MNTR_FLAG_ACTIVE), + MONITOR_FLAG_SKIP_TX = BIT(NL80211_MNTR_FLAG_SKIP_TX), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1b8827f920ff..6d11437596b9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4703,6 +4703,7 @@ enum nl80211_survey_info { * overrides all other flags. * @NL80211_MNTR_FLAG_ACTIVE: use the configured MAC address * and ACK incoming unicast packets. + * @NL80211_MNTR_FLAG_SKIP_TX: do not pass local tx packets * * @__NL80211_MNTR_FLAG_AFTER_LAST: internal use * @NL80211_MNTR_FLAG_MAX: highest possible monitor flag @@ -4715,6 +4716,7 @@ enum nl80211_mntr_flags { NL80211_MNTR_FLAG_OTHER_BSS, NL80211_MNTR_FLAG_COOK_FRAMES, NL80211_MNTR_FLAG_ACTIVE, + NL80211_MNTR_FLAG_SKIP_TX, /* keep last */ __NL80211_MNTR_FLAG_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 84015f56e93a..4a8c3b6d49d1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4206,6 +4206,7 @@ static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = { [NL80211_MNTR_FLAG_OTHER_BSS] = { .type = NLA_FLAG }, [NL80211_MNTR_FLAG_COOK_FRAMES] = { .type = NLA_FLAG }, [NL80211_MNTR_FLAG_ACTIVE] = { .type = NLA_FLAG }, + [NL80211_MNTR_FLAG_SKIP_TX] = { .type = NLA_FLAG }, }; static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) -- cgit v1.3 From 754905ce1a327ee3297548e132367038cc62b3d8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:50 +0200 Subject: wifi: mac80211: add support for the monitor SKIP_TX flag Do not pass locally sent packets to monitor interfaces with this flag set. Skip processing tx packets on the status call entirely if no monitor interfaces without this flag are present. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/c327bb57ef8dadaa6a0e8e4dc2f5f99ae8123e6c.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/iface.c | 2 ++ net/mac80211/status.c | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d20c2e796703..7dcb46120abc 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1368,7 +1368,7 @@ struct ieee80211_local { spinlock_t queue_stop_reason_lock; int open_count; - int monitors, cooked_mntrs; + int monitors, cooked_mntrs, tx_mntrs; /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 683cc50cc266..57f6fac343eb 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1094,6 +1094,8 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, ADJUST(CONTROL, control); ADJUST(CONTROL, pspoll); ADJUST(OTHER_BSS, other_bss); + if (!(flags & MONITOR_FLAG_SKIP_TX)) + local->tx_mntrs += offset; #undef ADJUST } diff --git a/net/mac80211/status.c b/net/mac80211/status.c index b41b867f43b2..5f28f3633fa0 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -927,6 +927,9 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, if (!ieee80211_sdata_running(sdata)) continue; + if (sdata->u.mntr.flags & MONITOR_FLAG_SKIP_TX) + continue; + if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; @@ -1099,7 +1102,7 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, * This is a bit racy but we can avoid a lot of work * with this test... */ - if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { + if (!local->tx_mntrs && (!send_to_cooked || !local->cooked_mntrs)) { if (status->free_list) list_add_tail(&skb->list, status->free_list); else -- cgit v1.3 From 342afe693ee765a215343fe1a1af0d6c8b8e10a3 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:51 +0200 Subject: wifi: mac80211: refactor ieee80211_rx_monitor Rework the monitor mode interface iteration to get rid of the last_monitor condition. Preparation for further filtering received monitor packets. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/d57d82f109643894325beb9db6da8f001fc533eb.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d032bfb00ade..4849af232ee6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -762,8 +762,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, struct ieee80211_rate *rate) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb); - struct ieee80211_sub_if_data *sdata; - struct sk_buff *monskb = NULL; + struct ieee80211_sub_if_data *sdata, *prev_sdata = NULL; + struct sk_buff *skb, *monskb = NULL; int present_fcs_len = 0; unsigned int rtap_space = 0; struct ieee80211_sub_if_data *monitor_sdata = @@ -837,8 +837,10 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) { - bool last_monitor = list_is_last(&sdata->u.mntr.list, - &local->mon_list); + if (!prev_sdata) { + prev_sdata = sdata; + continue; + } if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) ieee80211_handle_mu_mimo_mon(sdata, origskb, rtap_space); @@ -846,34 +848,34 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!monskb) monskb = ieee80211_make_monitor_skb(local, &origskb, rate, rtap_space, - only_monitor && - last_monitor); + false); + if (!monskb) + continue; - if (monskb) { - struct sk_buff *skb; + skb = skb_clone(monskb, GFP_ATOMIC); + if (!skb) + continue; - if (last_monitor) { - skb = monskb; - monskb = NULL; - } else { - skb = skb_clone(monskb, GFP_ATOMIC); - } + skb->dev = prev_sdata->dev; + dev_sw_netstats_rx_add(skb->dev, skb->len); + netif_receive_skb(skb); + prev_sdata = sdata; + } - if (skb) { - skb->dev = sdata->dev; - dev_sw_netstats_rx_add(skb->dev, skb->len); - netif_receive_skb(skb); - } + if (prev_sdata) { + if (monskb) + skb = monskb; + else + skb = ieee80211_make_monitor_skb(local, &origskb, + rate, rtap_space, + only_monitor); + if (skb) { + skb->dev = prev_sdata->dev; + dev_sw_netstats_rx_add(skb->dev, skb->len); + netif_receive_skb(skb); } - - if (last_monitor) - break; } - /* this happens if last_monitor was erroneously false */ - dev_kfree_skb(monskb); - - /* ditto */ if (!origskb) return NULL; -- cgit v1.3 From f92e0cf19ae0fd08e7b60f24d27a5819d8d949ba Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 9 Oct 2024 10:25:52 +0200 Subject: wifi: mac80211: filter on monitor interfaces based on configured channel When a monitor interface has an assigned channel (only happens with the NO_VIRTUAL_MONITOR feature), only pass packets received on that channel. This is useful for monitoring on multiple channels at the same time using multiple monitor interfaces. Signed-off-by: Felix Fietkau Link: https://patch.msgid.link/1bbe55107ba0f2e62ea90f305faeb7ba9247ef29.1728462320.git-series.nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4849af232ee6..2bec18fc1b03 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -837,6 +837,13 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) { + struct cfg80211_chan_def *chandef; + + chandef = &sdata->vif.bss_conf.chanreq.oper; + if (chandef->chan && + chandef->chan->center_freq != status->freq) + continue; + if (!prev_sdata) { prev_sdata = sdata; continue; -- cgit v1.3 From 2d63e6530ec1f50f57c1cde598274d055fb7a36c Mon Sep 17 00:00:00 2001 From: Michael-CY Lee Date: Wed, 9 Oct 2024 20:18:12 +0800 Subject: wifi: mac80211: refactor BW limitation check for CSA parsing Refactor the BW limitation check to a more general format when parsing CSA. Also, the original BW check did not account for BW less than 160 MHz. Signed-off-by: Michael-CY Lee Link: https://patch.msgid.link/20241009121812.2419-1-michael-cy.lee@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/spectmgmt.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 073ff9e0f397..c6015cd00372 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -377,13 +377,8 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, /* capture the AP chandef before (potential) downgrading */ csa_ie->chanreq.ap = new_chandef; - if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320 && - new_chandef.width == NL80211_CHAN_WIDTH_320) - ieee80211_chandef_downgrade(&new_chandef, NULL); - - if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160 && - (new_chandef.width == NL80211_CHAN_WIDTH_80P80 || - new_chandef.width == NL80211_CHAN_WIDTH_160)) + while (conn->bw_limit < + ieee80211_min_bw_limit_from_chandef(&new_chandef)) ieee80211_chandef_downgrade(&new_chandef, NULL); if (!cfg80211_chandef_compatible(&new_chandef, -- cgit v1.3 From b457d8713872a9aad89c8a57dd8fe471c7db158d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 21 Oct 2024 15:14:14 +0200 Subject: wifi: mac80211: remove misleading j_0 construction parts The GCM algorithm implementation in the kernel assumes that a 12-byte IV is passed, not the actual J_0 from the GCM spec. Don't rename, that'd be messy, but also don't fill the bytes beyond the IV that aren't used, since otherwise it looks as though j_0[12] is used uninitialized. Link: https://patch.msgid.link/20241021151414.798ceb7a5896.Ic57751edad228d56865ecf7433fef469e5e0a4aa@changeid Signed-off-by: Johannes Berg --- net/mac80211/wpa.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 293afa3f57c5..40d5d9e48479 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -598,9 +598,6 @@ static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad, memcpy(j_0, hdr->addr2, ETH_ALEN); memcpy(&j_0[ETH_ALEN], pn, IEEE80211_GCMP_PN_LEN); - j_0[13] = 0; - j_0[14] = 0; - j_0[AES_BLOCK_SIZE - 1] = 0x01; ccmp_gcmp_aad(skb, aad, spp_amsdu); } -- cgit v1.3 From 31cb94f71c1ba126b9c684c5882cbf5032d0d1bc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 Oct 2024 15:18:35 +0200 Subject: wifi: mac80211: convert debugfs files to short fops Given the large size of the regular struct file_operations, save a lot of space with the newly added short fops for debugfs. Link: https://patch.msgid.link/20241022151838.2f6de3ea3ecc.I45657e6a8415d796ec95c95becc9efb377ee3be6@changeid Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 27 +++++++++------------------ net/mac80211/debugfs_key.c | 9 +++------ net/mac80211/debugfs_netdev.c | 3 +-- net/mac80211/debugfs_sta.c | 9 +++------ net/mac80211/rate.c | 3 +-- net/mac80211/rate.h | 2 +- 6 files changed, 18 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index b777240c924e..be2e486907f9 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -42,9 +42,8 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \ } #define DEBUGFS_READONLY_FILE_OPS(name) \ -static const struct file_operations name## _ops = { \ +static const struct debugfs_short_fops name## _ops = { \ .read = name## _read, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ }; @@ -142,10 +141,9 @@ static ssize_t aqm_write(struct file *file, return -EINVAL; } -static const struct file_operations aqm_ops = { +static const struct debugfs_short_fops aqm_ops = { .write = aqm_write, .read = aqm_read, - .open = simple_open, .llseek = default_llseek, }; @@ -194,10 +192,9 @@ static ssize_t airtime_flags_write(struct file *file, return count; } -static const struct file_operations airtime_flags_ops = { +static const struct debugfs_short_fops airtime_flags_ops = { .write = airtime_flags_write, .read = airtime_flags_read, - .open = simple_open, .llseek = default_llseek, }; @@ -225,9 +222,8 @@ static ssize_t aql_pending_read(struct file *file, buf, len); } -static const struct file_operations aql_pending_ops = { +static const struct debugfs_short_fops aql_pending_ops = { .read = aql_pending_read, - .open = simple_open, .llseek = default_llseek, }; @@ -305,10 +301,9 @@ static ssize_t aql_txq_limit_write(struct file *file, return count; } -static const struct file_operations aql_txq_limit_ops = { +static const struct debugfs_short_fops aql_txq_limit_ops = { .write = aql_txq_limit_write, .read = aql_txq_limit_read, - .open = simple_open, .llseek = default_llseek, }; @@ -355,10 +350,9 @@ static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, return count; } -static const struct file_operations aql_enable_ops = { +static const struct debugfs_short_fops aql_enable_ops = { .write = aql_enable_write, .read = aql_enable_read, - .open = simple_open, .llseek = default_llseek, }; @@ -406,10 +400,9 @@ static ssize_t force_tx_status_write(struct file *file, return count; } -static const struct file_operations force_tx_status_ops = { +static const struct debugfs_short_fops force_tx_status_ops = { .write = force_tx_status_write, .read = force_tx_status_read, - .open = simple_open, .llseek = default_llseek, }; @@ -434,9 +427,8 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf, return count; } -static const struct file_operations reset_ops = { +static const struct debugfs_short_fops reset_ops = { .write = reset_write, - .open = simple_open, .llseek = noop_llseek, }; #endif @@ -624,9 +616,8 @@ static ssize_t stats_ ##name## _read(struct file *file, \ print_devstats_##name); \ } \ \ -static const struct file_operations stats_ ##name## _ops = { \ +static const struct debugfs_short_fops stats_ ##name## _ops = { \ .read = stats_ ##name## _read, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ }; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 7e54da508765..b3a64edea0f2 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -26,17 +26,15 @@ static ssize_t key_##name##_read(struct file *file, \ #define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n") #define KEY_OPS(name) \ -static const struct file_operations key_ ##name## _ops = { \ +static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } #define KEY_OPS_W(name) \ -static const struct file_operations key_ ##name## _ops = { \ +static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .write = key_##name##_write, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } @@ -49,9 +47,8 @@ static const struct file_operations key_ ##name## _ops = { \ #define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n") #define KEY_CONF_OPS(name) \ -static const struct file_operations key_ ##name## _ops = { \ +static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_conf_##name##_read, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 68596ef78b15..a9bc2fd59f55 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -221,10 +221,9 @@ static ssize_t ieee80211_if_fmt_##name( \ } #define _IEEE80211_IF_FILE_OPS(name, _read, _write) \ -static const struct file_operations name##_ops = { \ +static const struct debugfs_short_fops name##_ops = { \ .read = (_read), \ .write = (_write), \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 1e9389c49a57..a67a9d316008 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -30,17 +30,15 @@ static ssize_t sta_ ##name## _read(struct file *file, \ #define STA_READ_D(name, field) STA_READ(name, field, "%d\n") #define STA_OPS(name) \ -static const struct file_operations sta_ ##name## _ops = { \ +static const struct debugfs_short_fops sta_ ##name## _ops = { \ .read = sta_##name##_read, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } #define STA_OPS_RW(name) \ -static const struct file_operations sta_ ##name## _ops = { \ +static const struct debugfs_short_fops sta_ ##name## _ops = { \ .read = sta_##name##_read, \ .write = sta_##name##_write, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } @@ -450,9 +448,8 @@ STA_OPS_RW(agg_status); /* link sta attributes */ #define LINK_STA_OPS(name) \ -static const struct file_operations link_sta_ ##name## _ops = { \ +static const struct debugfs_short_fops link_sta_ ##name## _ops = { \ .read = link_sta_##name##_read, \ - .open = simple_open, \ .llseek = generic_file_llseek, \ } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 63fe58311a77..0d056db9f81e 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -249,9 +249,8 @@ static ssize_t rcname_read(struct file *file, char __user *userbuf, ref->ops->name, len); } -const struct file_operations rcname_ops = { +const struct debugfs_short_fops rcname_ops = { .read = rcname_read, - .open = simple_open, .llseek = default_llseek, }; #endif diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 673aa9efe30b..5e4bde598212 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -62,7 +62,7 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta) #endif } -extern const struct file_operations rcname_ops; +extern const struct debugfs_short_fops rcname_ops; static inline void rate_control_add_debugfs(struct ieee80211_local *local) { -- cgit v1.3 From 188a1bf894323bf3a90361676da8364c82ce6d32 Mon Sep 17 00:00:00 2001 From: Aditya Kumar Singh Date: Tue, 1 Oct 2024 14:20:34 +0530 Subject: wifi: mac80211: re-order assigning channel in activate links The current flow in _ieee80211_set_active_links() does not align with the operational requirements of drivers that groups multiple hardware under a single wiphy. These drivers (e.g ath12k) rely on channel assignment to determine the appropriate hardware for each link. Without this, the drivers cannot correctly establish the link interface. Currently in _ieee80211_set_active_links(), after calling drv_change_vif_links() on the driver, the state of all connected stations is updated via drv_change_sta_links(). This is followed by handling keys in the links, and finally, assigning the channel to the links. Consequently, drv_change_sta_links() prompts drivers to create the station entry at their level and within their firmware. However, since channels have not yet been assigned to links at this stage, drivers have not created the necessary link interface for establishing link stations, leading to failures in activating the links. Therefore, re-order the logic so that after drv_change_vif_links() and removing the old links, channels are assigned to newly added links. Following this, the flow proceeds to station handling. Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20241001085034.2745669-1-quic_adisi@quicinc.com [Johannes: fix iwlwifi to deal with the changes] Reviewed-by: Miriam Rachel Korenblit Signed-off-by: Johannes Berg --- .../net/wireless/intel/iwlwifi/mvm/mld-mac80211.c | 40 +++++++---------- drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c | 3 ++ net/mac80211/link.c | 51 +++++++++++++--------- 3 files changed, 50 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c index 9aa08d289680..a5c38f389c69 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac80211.c @@ -331,23 +331,18 @@ __iwl_mvm_mld_assign_vif_chanctx(struct iwl_mvm *mvm, if (ret) goto out; - /* Initialize rate control for the AP station, since we might be - * doing a link switch here - we cannot initialize it before since - * this needs the phy context assigned (and in FW?), and we cannot - * do it later because it needs to be initialized as soon as we're - * able to TX on the link, i.e. when active. + /* + * if link switching (link not active yet) we'll activate it in + * firmware later on link-info change, which mac80211 guarantees + * for link switch after the stations are set up */ - if (mvmvif->ap_sta) { - struct ieee80211_link_sta *link_sta; - - rcu_read_lock(); - link_sta = rcu_dereference(mvmvif->ap_sta->link[link_id]); - - if (!WARN_ON_ONCE(!link_sta)) - iwl_mvm_rs_rate_init(mvm, vif, mvmvif->ap_sta, - link_conf, link_sta, - phy_ctxt->channel->band); - rcu_read_unlock(); + if (ieee80211_vif_link_active(vif, link_conf->link_id)) { + ret = iwl_mvm_link_changed(mvm, vif, link_conf, + LINK_CONTEXT_MODIFY_ACTIVE | + LINK_CONTEXT_MODIFY_RATES_INFO, + true); + if (ret) + goto out; } if (vif->type == NL80211_IFTYPE_STATION) @@ -355,14 +350,6 @@ __iwl_mvm_mld_assign_vif_chanctx(struct iwl_mvm *mvm, link_conf, false); - /* then activate */ - ret = iwl_mvm_link_changed(mvm, vif, link_conf, - LINK_CONTEXT_MODIFY_ACTIVE | - LINK_CONTEXT_MODIFY_RATES_INFO, - true); - if (ret) - goto out; - /* * Power state must be updated before quotas, * otherwise fw will complain. @@ -775,6 +762,11 @@ iwl_mvm_mld_link_info_changed_station(struct iwl_mvm *mvm, if (WARN_ON_ONCE(!mvmvif->link[link_conf->link_id])) return; + /* not yet marked active in vif means during link switch */ + if (!ieee80211_vif_link_active(vif, link_conf->link_id) && + vif->cfg.assoc && mvmvif->link[link_conf->link_id]->phy_ctxt) + link_changes |= LINK_CONTEXT_MODIFY_ACTIVE; + has_he = link_conf->he_support && !iwlwifi_mod_params.disable_11ax; has_eht = link_conf->eht_support && !iwlwifi_mod_params.disable_11be; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c index 28a9d90ad1cd..99eb1e1db1bb 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c @@ -1182,6 +1182,9 @@ int iwl_mvm_mld_update_sta_links(struct iwl_mvm *mvm, link_sta_added_to_fw |= BIT(link_id); iwl_mvm_rs_add_sta_link(mvm, mvm_sta_link); + + iwl_mvm_rs_rate_init(mvm, vif, sta, link_conf, link_sta, + link_conf->chanreq.oper.chan->band); } if (sta_mask_added) { diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 503bdea904bc..e8def387d0f8 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -388,6 +388,37 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, jiffies); } + for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { + struct ieee80211_link_data *link; + + link = sdata_dereference(sdata->link[link_id], sdata); + + /* + * This call really should not fail. Unfortunately, it appears + * that this may happen occasionally with some drivers. Should + * it happen, we are stuck in a bad place as going backwards is + * not really feasible. + * + * So lets just tell link_use_channel that it must not fail to + * assign the channel context (from mac80211's perspective) and + * assume the driver is going to trigger a recovery flow if it + * had a failure. + * That really is not great nor guaranteed to work. But at least + * the internal mac80211 state remains consistent and there is + * a chance that we can recover. + */ + ret = _ieee80211_link_use_channel(link, + &link->conf->chanreq, + IEEE80211_CHANCTX_SHARED, + true); + WARN_ON_ONCE(ret); + + /* + * inform about the link info changed parameters after all + * stations are also added + */ + } + list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; @@ -431,26 +462,6 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, link = sdata_dereference(sdata->link[link_id], sdata); - /* - * This call really should not fail. Unfortunately, it appears - * that this may happen occasionally with some drivers. Should - * it happen, we are stuck in a bad place as going backwards is - * not really feasible. - * - * So lets just tell link_use_channel that it must not fail to - * assign the channel context (from mac80211's perspective) and - * assume the driver is going to trigger a recovery flow if it - * had a failure. - * That really is not great nor guaranteed to work. But at least - * the internal mac80211 state remains consistent and there is - * a chance that we can recover. - */ - ret = _ieee80211_link_use_channel(link, - &link->conf->chanreq, - IEEE80211_CHANCTX_SHARED, - true); - WARN_ON_ONCE(ret); - ieee80211_mgd_set_link_qos_params(link); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_ERP_CTS_PROT | -- cgit v1.3 From eaed5fc0c322d75cfcdbc7a16c0c5023d9e4f6fe Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 10 Oct 2024 13:40:36 -0700 Subject: mac80211: Remove NOP call to ieee80211_hw_config If changed is '0', then the ieee80211_hw_config takes no action, so just remove the call in __ieee809211_recalc_txpower() Signed-off-by: Ben Greear Link: https://patch.msgid.link/20241010204036.1219896-1-greearb@candelatech.com Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 57f6fac343eb..a8fbedd530f4 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -67,7 +67,6 @@ bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link) if (power != link->conf->txpower) { link->conf->txpower = power; - ieee80211_hw_config(link->sdata->local, 0); return true; } -- cgit v1.3 From 08a9572be36819b5d9011604edfa5db6c5062a7a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:32 -0700 Subject: phonet: Pass ifindex to fill_addr(). We will convert addr_doit() and getaddr_dumpit() to RCU, both of which call fill_addr(). The former will call phonet_address_notify() outside of RCU due to GFP_KERNEL, so dev will not be available in fill_addr(). Let's pass ifindex directly to fill_addr(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/phonet/pn_netlink.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 894e5c72d6bf..3205d2457477 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -19,7 +19,7 @@ /* Device address handling */ -static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, +static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, u32 portid, u32 seq, int event); void phonet_address_notify(int event, struct net_device *dev, u8 addr) @@ -31,7 +31,8 @@ void phonet_address_notify(int event, struct net_device *dev, u8 addr) nla_total_size(1), GFP_KERNEL); if (skb == NULL) goto errout; - err = fill_addr(skb, dev, addr, 0, 0, event); + + err = fill_addr(skb, dev->ifindex, addr, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); @@ -92,8 +93,8 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, - u32 portid, u32 seq, int event) +static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, + u32 portid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; @@ -107,7 +108,7 @@ static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, ifm->ifa_prefixlen = 0; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_LINK; - ifm->ifa_index = dev->ifindex; + ifm->ifa_index = ifindex; if (nla_put_u8(skb, IFA_LOCAL, addr)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -140,7 +141,7 @@ static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (addr_idx++ < addr_start_idx) continue; - if (fill_addr(skb, pnd->netdev, addr << 2, + if (fill_addr(skb, pnd->netdev->ifindex, addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) goto out; -- cgit v1.3 From 68ed5c38b512b734caf3da1f87db4a99fcfe3002 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:33 -0700 Subject: phonet: Pass net and ifindex to phonet_address_notify(). Currently, phonet_address_notify() fetches netns and ifindex from dev. Once addr_doit() is converted to RCU, phonet_address_notify() will be called outside of RCU due to GFP_KERNEL, and dev will be unavailable there. Let's pass net and ifindex to phonet_address_notify(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 2 +- net/phonet/pn_dev.c | 10 +++++++--- net/phonet/pn_netlink.c | 12 ++++++------ 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index e9dc8dca5817..6b2102b4ece3 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -38,7 +38,7 @@ int phonet_address_add(struct net_device *dev, u8 addr); int phonet_address_del(struct net_device *dev, u8 addr); u8 phonet_address_get(struct net_device *dev, u8 addr); int phonet_address_lookup(struct net *net, u8 addr); -void phonet_address_notify(int event, struct net_device *dev, u8 addr); +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index cde671d29d5d..2e7d850dc726 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -98,10 +98,13 @@ static void phonet_device_destroy(struct net_device *dev) mutex_unlock(&pndevs->lock); if (pnd) { + struct net *net = dev_net(dev); + u32 ifindex = dev->ifindex; u8 addr; for_each_set_bit(addr, pnd->addrs, 64) - phonet_address_notify(RTM_DELADDR, dev, addr); + phonet_address_notify(net, RTM_DELADDR, ifindex, addr); + kfree(pnd); } } @@ -244,8 +247,9 @@ static int phonet_device_autoconf(struct net_device *dev) ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device); if (ret) return ret; - phonet_address_notify(RTM_NEWADDR, dev, - req.ifr_phonet_autoconf.device); + + phonet_address_notify(dev_net(dev), RTM_NEWADDR, dev->ifindex, + req.ifr_phonet_autoconf.device); return 0; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 3205d2457477..23097085ad38 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -22,7 +22,7 @@ static int fill_addr(struct sk_buff *skb, u32 ifindex, u8 addr, u32 portid, u32 seq, int event); -void phonet_address_notify(int event, struct net_device *dev, u8 addr) +void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr) { struct sk_buff *skb; int err = -ENOBUFS; @@ -32,17 +32,17 @@ void phonet_address_notify(int event, struct net_device *dev, u8 addr) if (skb == NULL) goto errout; - err = fill_addr(skb, dev->ifindex, addr, 0, 0, event); + err = fill_addr(skb, ifindex, addr, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); + + rtnl_notify(skb, net, 0, RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); return; errout: - rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_PHONET_IFADDR, err); } static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { @@ -89,7 +89,7 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, else err = phonet_address_del(dev, pnaddr); if (!err) - phonet_address_notify(nlh->nlmsg_type, dev, pnaddr); + phonet_address_notify(net, nlh->nlmsg_type, ifm->ifa_index, pnaddr); return err; } -- cgit v1.3 From 42f5fe1dc4babad1c49bcc4121983fffccee3cd9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:34 -0700 Subject: phonet: Convert phonet_device_list.lock to spinlock_t. addr_doit() calls phonet_address_add() or phonet_address_del() for RTM_NEWADDR or RTM_DELADDR, respectively. Both functions only touch phonet_device_list(dev_net(dev)), which is currently protected by RTNL and its dedicated mutex, phonet_device_list.lock. We will convert addr_doit() to RCU and cannot use mutex inside RCU. Let's convert the mutex to spinlock_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 3 ++- net/phonet/pn_dev.c | 26 +++++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 6b2102b4ece3..ac0331d83a81 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -12,12 +12,13 @@ #include #include +#include struct net; struct phonet_device_list { struct list_head list; - struct mutex lock; + spinlock_t lock; }; struct phonet_device_list *phonet_device_list(struct net *net); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 2e7d850dc726..545279ef5910 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -54,7 +54,7 @@ static struct phonet_device *__phonet_device_alloc(struct net_device *dev) pnd->netdev = dev; bitmap_zero(pnd->addrs, 64); - BUG_ON(!mutex_is_locked(&pndevs->lock)); + lockdep_assert_held(&pndevs->lock); list_add_rcu(&pnd->list, &pndevs->list); return pnd; } @@ -64,7 +64,8 @@ static struct phonet_device *__phonet_get(struct net_device *dev) struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; - BUG_ON(!mutex_is_locked(&pndevs->lock)); + lockdep_assert_held(&pndevs->lock); + list_for_each_entry(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; @@ -91,11 +92,13 @@ static void phonet_device_destroy(struct net_device *dev) ASSERT_RTNL(); - mutex_lock(&pndevs->lock); + spin_lock(&pndevs->lock); + pnd = __phonet_get(dev); if (pnd) list_del_rcu(&pnd->list); - mutex_unlock(&pndevs->lock); + + spin_unlock(&pndevs->lock); if (pnd) { struct net *net = dev_net(dev); @@ -136,7 +139,8 @@ int phonet_address_add(struct net_device *dev, u8 addr) struct phonet_device *pnd; int err = 0; - mutex_lock(&pndevs->lock); + spin_lock(&pndevs->lock); + /* Find or create Phonet-specific device data */ pnd = __phonet_get(dev); if (pnd == NULL) @@ -145,7 +149,9 @@ int phonet_address_add(struct net_device *dev, u8 addr) err = -ENOMEM; else if (test_and_set_bit(addr >> 2, pnd->addrs)) err = -EEXIST; - mutex_unlock(&pndevs->lock); + + spin_unlock(&pndevs->lock); + return err; } @@ -155,7 +161,8 @@ int phonet_address_del(struct net_device *dev, u8 addr) struct phonet_device *pnd; int err = 0; - mutex_lock(&pndevs->lock); + spin_lock(&pndevs->lock); + pnd = __phonet_get(dev); if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) { err = -EADDRNOTAVAIL; @@ -164,7 +171,8 @@ int phonet_address_del(struct net_device *dev, u8 addr) list_del_rcu(&pnd->list); else pnd = NULL; - mutex_unlock(&pndevs->lock); + + spin_unlock(&pndevs->lock); if (pnd) kfree_rcu(pnd, rcu); @@ -313,7 +321,7 @@ static int __net_init phonet_init_net(struct net *net) return -ENOMEM; INIT_LIST_HEAD(&pnn->pndevs.list); - mutex_init(&pnn->pndevs.lock); + spin_lock_init(&pnn->pndevs.lock); mutex_init(&pnn->routes.lock); return 0; } -- cgit v1.3 From 8786e98dd0ebb8ab3adfddd3517f3505c3a61c23 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:35 -0700 Subject: phonet: Don't hold RTNL for addr_doit(). Now only __dev_get_by_index() depends on RTNL in addr_doit(). Let's use dev_get_by_index_rcu() and register addr_doit() with RTNL_FLAG_DOIT_UNLOCKED. While at it, I changed phonet_rtnl_msg_handlers[]'s init to C99 style like other core networking code. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/phonet/pn_netlink.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 23097085ad38..5996141e258f 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -65,8 +65,6 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - ASSERT_RTNL(); - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy, extack); if (err < 0) @@ -80,16 +78,24 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, /* Phonet addresses only have 6 high-order bits */ return -EINVAL; - dev = __dev_get_by_index(net, ifm->ifa_index); - if (dev == NULL) + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifm->ifa_index); + if (!dev) { + rcu_read_unlock(); return -ENODEV; + } if (nlh->nlmsg_type == RTM_NEWADDR) err = phonet_address_add(dev, pnaddr); else err = phonet_address_del(dev, pnaddr); + + rcu_read_unlock(); + if (!err) phonet_address_notify(net, nlh->nlmsg_type, ifm->ifa_index, pnaddr); + return err; } @@ -287,13 +293,18 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) } static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_module = { - {THIS_MODULE, PF_PHONET, RTM_NEWADDR, addr_doit, NULL, 0}, - {THIS_MODULE, PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0}, - {THIS_MODULE, PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0}, - {THIS_MODULE, PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0}, - {THIS_MODULE, PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0}, - {THIS_MODULE, PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, - RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWADDR, + .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELADDR, + .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR, + .dumpit = getaddr_dumpit}, + {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE, + .doit = route_doit}, + {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE, + .doit = route_doit}, + {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETROUTE, + .dumpit = route_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; int __init phonet_netlink_register(void) -- cgit v1.3 From b7d2fc9ad7fe75b536f94409b7f1e90e12e4f44d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:36 -0700 Subject: phonet: Don't hold RTNL for getaddr_dumpit(). getaddr_dumpit() already relies on RCU and does not need RTNL. Let's use READ_ONCE() for ifindex and register getaddr_dumpit() with RTNL_FLAG_DUMP_UNLOCKED. While at it, the retval of getaddr_dumpit() is changed to combine NLMSG_DONE and save recvmsg() as done in 58a4ff5d77b1 ("phonet: no longer hold RTNL in route_dumpit()"). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/phonet/pn_netlink.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 5996141e258f..14928fa04675 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -127,14 +127,17 @@ nla_put_failure: static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + int addr_idx = 0, addr_start_idx = cb->args[1]; + int dev_idx = 0, dev_start_idx = cb->args[0]; struct phonet_device_list *pndevs; struct phonet_device *pnd; - int dev_idx = 0, dev_start_idx = cb->args[0]; - int addr_idx = 0, addr_start_idx = cb->args[1]; + int err = 0; pndevs = phonet_device_list(sock_net(skb->sk)); + rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { + DECLARE_BITMAP(addrs, 64); u8 addr; if (dev_idx > dev_start_idx) @@ -143,23 +146,26 @@ static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) continue; addr_idx = 0; - for_each_set_bit(addr, pnd->addrs, 64) { + memcpy(addrs, pnd->addrs, sizeof(pnd->addrs)); + + for_each_set_bit(addr, addrs, 64) { if (addr_idx++ < addr_start_idx) continue; - if (fill_addr(skb, pnd->netdev->ifindex, addr << 2, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) + err = fill_addr(skb, READ_ONCE(pnd->netdev->ifindex), + addr << 2, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWADDR); + if (err < 0) goto out; } } - out: rcu_read_unlock(); + cb->args[0] = dev_idx; cb->args[1] = addr_idx; - return skb->len; + return err; } /* Routes handling */ @@ -298,7 +304,7 @@ static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_mo {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELADDR, .doit = addr_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR, - .dumpit = getaddr_dumpit}, + .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE, .doit = route_doit}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE, -- cgit v1.3 From 302fc6bbcba4beee6ff5e73c2fcc257e62667d4d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:37 -0700 Subject: phonet: Pass ifindex to fill_route(). We will convert route_doit() to RCU. route_doit() will call rtm_phonet_notify() outside of RCU due to GFP_KERNEL, so dev will not be available in fill_route(). Let's pass ifindex directly to fill_route(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/phonet/pn_netlink.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index 14928fa04675..c9a4215ec560 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -170,8 +170,8 @@ out: /* Routes handling */ -static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, - u32 portid, u32 seq, int event) +static int fill_route(struct sk_buff *skb, u32 ifindex, u8 dst, + u32 portid, u32 seq, int event) { struct rtmsg *rtm; struct nlmsghdr *nlh; @@ -190,8 +190,7 @@ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (nla_put_u8(skb, RTA_DST, dst) || - nla_put_u32(skb, RTA_OIF, READ_ONCE(dev->ifindex))) + if (nla_put_u8(skb, RTA_DST, dst) || nla_put_u32(skb, RTA_OIF, ifindex)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -210,7 +209,8 @@ void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) nla_total_size(1) + nla_total_size(4), GFP_KERNEL); if (skb == NULL) goto errout; - err = fill_route(skb, dev, dst, 0, 0, event); + + err = fill_route(skb, dev->ifindex, dst, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); @@ -286,7 +286,7 @@ static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) continue; - err = fill_route(skb, dev, addr << 2, + err = fill_route(skb, READ_ONCE(dev->ifindex), addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE); if (err < 0) -- cgit v1.3 From de51ad08b1177bbbb8b60cb7dd4c3c5dd50d262f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:38 -0700 Subject: phonet: Pass net and ifindex to rtm_phonet_notify(). Currently, rtm_phonet_notify() fetches netns and ifindex from dev. Once route_doit() is converted to RCU, rtm_phonet_notify() will be called outside of RCU due to GFP_KERNEL, and dev will be unavailable there. Let's pass net and ifindex to rtm_phonet_notify(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 2 +- net/phonet/pn_dev.c | 10 +++++++--- net/phonet/pn_netlink.c | 16 +++++++++------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index ac0331d83a81..021e524fd20a 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -43,7 +43,7 @@ void phonet_address_notify(struct net *net, int event, u32 ifindex, u8 addr); int phonet_route_add(struct net_device *dev, u8 daddr); int phonet_route_del(struct net_device *dev, u8 daddr); -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst); +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst); struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr); struct net_device *phonet_route_output(struct net *net, u8 daddr); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 545279ef5910..6ded0d347b9f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -263,9 +263,13 @@ static int phonet_device_autoconf(struct net_device *dev) static void phonet_route_autodel(struct net_device *dev) { - struct phonet_net *pnn = phonet_pernet(dev_net(dev)); - unsigned int i; + struct net *net = dev_net(dev); DECLARE_BITMAP(deleted, 64); + u32 ifindex = dev->ifindex; + struct phonet_net *pnn; + unsigned int i; + + pnn = phonet_pernet(net); /* Remove left-over Phonet routes */ bitmap_zero(deleted, 64); @@ -281,7 +285,7 @@ static void phonet_route_autodel(struct net_device *dev) return; /* short-circuit RCU */ synchronize_rcu(); for_each_set_bit(i, deleted, 64) { - rtm_phonet_notify(RTM_DELROUTE, dev, i); + rtm_phonet_notify(net, RTM_DELROUTE, ifindex, i); dev_put(dev); } } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index c9a4215ec560..bfec5bd639b6 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -200,7 +200,7 @@ nla_put_failure: return -EMSGSIZE; } -void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) +void rtm_phonet_notify(struct net *net, int event, u32 ifindex, u8 dst) { struct sk_buff *skb; int err = -ENOBUFS; @@ -210,17 +210,17 @@ void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) if (skb == NULL) goto errout; - err = fill_route(skb, dev->ifindex, dst, 0, 0, event); + err = fill_route(skb, ifindex, dst, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } - rtnl_notify(skb, dev_net(dev), 0, - RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); + + rtnl_notify(skb, net, 0, RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); return; errout: - rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_PHONET_ROUTE, err); } static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { @@ -235,6 +235,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *tb[RTA_MAX+1]; struct net_device *dev; struct rtmsg *rtm; + u32 ifindex; int err; u8 dst; @@ -260,7 +261,8 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (dst & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; - dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF])); + ifindex = nla_get_u32(tb[RTA_OIF]); + dev = __dev_get_by_index(net, ifindex); if (dev == NULL) return -ENODEV; @@ -269,7 +271,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, else err = phonet_route_del(dev, dst); if (!err) - rtm_phonet_notify(nlh->nlmsg_type, dev, dst); + rtm_phonet_notify(net, nlh->nlmsg_type, ifindex, dst); return err; } -- cgit v1.3 From 3deec3b4afb4c767007eae1eeedbcf3da599395b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:39 -0700 Subject: phonet: Convert phonet_routes.lock to spinlock_t. route_doit() calls phonet_route_add() or phonet_route_del() for RTM_NEWROUTE or RTM_DELROUTE, respectively. Both functions only touch phonet_pernet(dev_net(dev))->routes, which is currently protected by RTNL and its dedicated mutex, phonet_routes.lock. We will convert route_doit() to RCU and cannot use mutex inside RCU. Let's convert the mutex to spinlock_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/phonet/pn_dev.h | 1 - net/phonet/pn_dev.c | 23 ++++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/phonet/pn_dev.h b/include/net/phonet/pn_dev.h index 021e524fd20a..37a3e83531c6 100644 --- a/include/net/phonet/pn_dev.h +++ b/include/net/phonet/pn_dev.h @@ -11,7 +11,6 @@ #define PN_DEV_H #include -#include #include struct net; diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 6ded0d347b9f..19234d664c4f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -22,7 +22,7 @@ #include struct phonet_routes { - struct mutex lock; + spinlock_t lock; struct net_device __rcu *table[64]; }; @@ -273,13 +273,15 @@ static void phonet_route_autodel(struct net_device *dev) /* Remove left-over Phonet routes */ bitmap_zero(deleted, 64); - mutex_lock(&pnn->routes.lock); - for (i = 0; i < 64; i++) + + spin_lock(&pnn->routes.lock); + for (i = 0; i < 64; i++) { if (rcu_access_pointer(pnn->routes.table[i]) == dev) { RCU_INIT_POINTER(pnn->routes.table[i], NULL); set_bit(i, deleted); } - mutex_unlock(&pnn->routes.lock); + } + spin_unlock(&pnn->routes.lock); if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ @@ -326,7 +328,7 @@ static int __net_init phonet_init_net(struct net *net) INIT_LIST_HEAD(&pnn->pndevs.list); spin_lock_init(&pnn->pndevs.lock); - mutex_init(&pnn->routes.lock); + spin_lock_init(&pnn->routes.lock); return 0; } @@ -376,13 +378,15 @@ int phonet_route_add(struct net_device *dev, u8 daddr) int err = -EEXIST; daddr = daddr >> 2; - mutex_lock(&routes->lock); + + spin_lock(&routes->lock); if (routes->table[daddr] == NULL) { rcu_assign_pointer(routes->table[daddr], dev); dev_hold(dev); err = 0; } - mutex_unlock(&routes->lock); + spin_unlock(&routes->lock); + return err; } @@ -392,12 +396,13 @@ int phonet_route_del(struct net_device *dev, u8 daddr) struct phonet_routes *routes = &pnn->routes; daddr = daddr >> 2; - mutex_lock(&routes->lock); + + spin_lock(&routes->lock); if (rcu_access_pointer(routes->table[daddr]) == dev) RCU_INIT_POINTER(routes->table[daddr], NULL); else dev = NULL; - mutex_unlock(&routes->lock); + spin_unlock(&routes->lock); if (!dev) return -ENOENT; -- cgit v1.3 From 17a1ac0018ae1cee0b2c2235ce54e91ecbbed7be Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Oct 2024 11:31:40 -0700 Subject: phonet: Don't hold RTNL for route_doit(). Now only __dev_get_by_index() depends on RTNL in route_doit(). Let's use dev_get_by_index_rcu() and register route_doit() with RTNL_FLAG_DOIT_UNLOCKED. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/phonet/pn_netlink.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index bfec5bd639b6..ca1f04e4a2d9 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -245,8 +245,6 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - ASSERT_RTNL(); - err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy, extack); if (err < 0) @@ -262,16 +260,25 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; ifindex = nla_get_u32(tb[RTA_OIF]); - dev = __dev_get_by_index(net, ifindex); - if (dev == NULL) + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); return -ENODEV; + } if (nlh->nlmsg_type == RTM_NEWROUTE) err = phonet_route_add(dev, dst); else err = phonet_route_del(dev, dst); + + rcu_read_unlock(); + if (!err) rtm_phonet_notify(net, nlh->nlmsg_type, ifindex, dst); + return err; } @@ -308,9 +315,9 @@ static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_mo {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETADDR, .dumpit = getaddr_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_NEWROUTE, - .doit = route_doit}, + .doit = route_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_DELROUTE, - .doit = route_doit}, + .doit = route_doit, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_PHONET, .msgtype = RTM_GETROUTE, .dumpit = route_dumpit, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; -- cgit v1.3 From b9a5a07aeaa2a903fb1306eb422880b2fa5f937f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 23 Oct 2024 16:47:50 -0700 Subject: bpf: Add "bool swap_uptrs" arg to bpf_local_storage_update() and bpf_selem_alloc() In a later patch, the task local storage will only accept uptr from the syscall update_elem and will not accept uptr from the bpf prog. The reason is the bpf prog does not have a way to provide a valid user space address. bpf_local_storage_update() and bpf_selem_alloc() are used by both bpf prog bpf_task_storage_get(BPF_LOCAL_STORAGE_GET_F_CREATE) and bpf syscall update_elem. "bool swap_uptrs" arg is added to bpf_local_storage_update() and bpf_selem_alloc() to tell if it is called by the bpf prog or by the bpf syscall. When swap_uptrs==true, it is called by the syscall. The arg is named (swap_)uptrs because the later patch will swap the uptrs between the newly allocated selem and the user space provided map_value. It will make error handling easier in case map->ops->map_update_elem() fails and the caller can decide if it needs to unpin the uptr in the user space provided map_value or the bpf_local_storage_update() has already taken the uptr ownership and will take care of unpinning it also. Only swap_uptrs==false is passed now. The logic to handle the true case will be added in a later patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20241023234759.860539-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++-- kernel/bpf/bpf_cgrp_storage.c | 4 ++-- kernel/bpf/bpf_inode_storage.c | 4 ++-- kernel/bpf/bpf_local_storage.c | 8 ++++---- kernel/bpf/bpf_task_storage.c | 4 ++-- net/core/bpf_sk_storage.c | 6 +++--- 6 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index dcddb0aef7d8..0c7216c065d5 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -181,7 +181,7 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem, gfp_t gfp_flags); + bool charge_mem, bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, @@ -195,7 +195,7 @@ bpf_local_storage_alloc(void *owner, struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags); + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 28efd0a3f220..20f05de92e9c 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); @@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, - value, BPF_NOEXIST, gfp_flags); + value, BPF_NOEXIST, false, gfp_flags); unlock: bpf_cgrp_storage_unlock(); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..44ccebc745e5 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -100,7 +100,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(file_inode(fd_file(f)), (struct bpf_local_storage_map *)map, - value, map_flags, GFP_ATOMIC); + value, map_flags, false, GFP_ATOMIC); return PTR_ERR_OR_ZERO(sdata); } @@ -154,7 +154,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index c938dea5ddbf..1cf772cb26eb 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem, gfp_t gfp_flags) + void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -524,7 +524,7 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags, gfp_t gfp_flags) + void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *alloc_selem, *selem = NULL; @@ -550,7 +550,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); @@ -584,7 +584,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, /* A lookup has just been done before and concluded a new selem is * needed. The chance of an unnecessary alloc is unlikely. */ - alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags); if (!alloc_selem) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..45dc3ca334d3 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -147,7 +147,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags, - GFP_ATOMIC); + false, GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -219,7 +219,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); return IS_ERR(sdata) ? NULL : sdata->data; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index bc01b3aa6b0f..2f4ed83a75ae 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -106,7 +106,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags, GFP_ATOMIC); + map_flags, false, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -137,7 +137,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -243,7 +243,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST, gfp_flags); + BPF_NOEXIST, false, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ -- cgit v1.3 From a42f3076648e0b507de9039f8085edcc10b35fb7 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 21 Oct 2024 17:14:03 +0200 Subject: mptcp: pm: send ACK on non-stale subflows If the subflow is considered as "staled", it is better to avoid it to send an ACK carrying an ADD_ADDR or RM_ADDR. Another subflow, if any, will then be selected. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-1-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index db586a5b3866..618289aac0ab 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -781,7 +781,7 @@ bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { - struct mptcp_subflow_context *subflow; + struct mptcp_subflow_context *subflow, *alt = NULL; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); @@ -792,10 +792,18 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) { if (__mptcp_subflow_active(subflow)) { - mptcp_pm_send_ack(msk, subflow, false, false); - break; + if (!subflow->stale) { + mptcp_pm_send_ack(msk, subflow, false, false); + return; + } + + if (!alt) + alt = subflow; } } + + if (alt) + mptcp_pm_send_ack(msk, alt, false, false); } int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, -- cgit v1.3 From 581c8cbfa934aaa555daa4e843242fcecc160f05 Mon Sep 17 00:00:00 2001 From: Gang Yan Date: Mon, 21 Oct 2024 17:14:04 +0200 Subject: mptcp: annotate data-races around subflow->fully_established We introduce the same handling for potential data races with the 'fully_established' flag in subflow as previously done for msk->fully_established. Additionally, we make a crucial change: convert the subflow's 'fully_established' from 'bit_field' to 'bool' type. This is necessary because methods for avoiding data races don't work well with 'bit_field'. Specifically, the 'READ_ONCE' needs to know the size of the variable being accessed, which is not supported in 'bit_field'. Also, 'test_bit' expect the address of 'bit_field'. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/516 Signed-off-by: Gang Yan Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-2-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/diag.c | 2 +- net/mptcp/options.c | 4 ++-- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 6 +++--- net/mptcp/subflow.c | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 2d3efb405437..02205f7994d7 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -47,7 +47,7 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; if (sf->request_bkup) flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; - if (sf->fully_established) + if (READ_ONCE(sf->fully_established)) flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; if (sf->conn_finished) flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 370c3836b771..1603b3702e22 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -461,7 +461,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ - if (subflow->fully_established || snd_data_fin_enable || + if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; @@ -930,7 +930,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ - if (likely(subflow->fully_established)) { + if (likely(READ_ONCE(subflow->fully_established))) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1f5c63eb21f0..a6c9661a4c45 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3511,7 +3511,7 @@ static void schedule_3rdack_retransmission(struct sock *ssk) struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; - if (mptcp_subflow_ctx(ssk)->fully_established) + if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 568a72702b08..a93e661ef5c4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -513,7 +513,6 @@ struct mptcp_subflow_context { request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ mp_join : 1, /* remote is JOINing */ - fully_established : 1, /* path validated */ pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, @@ -532,10 +531,11 @@ struct mptcp_subflow_context { is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ - __unused : 8; + __unused : 9; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ + bool fully_established; /* path validated */ u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -780,7 +780,7 @@ static inline bool __tcp_can_send(const struct sock *ssk) static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) + if (subflow->request_join && !READ_ONCE(subflow->fully_established)) return false; return __tcp_can_send(mptcp_subflow_tcp_sock(subflow)); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 6170f2fff71e..860903e06422 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -800,7 +800,7 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, const struct mptcp_options_received *mp_opt) { subflow_set_remote_key(msk, subflow, mp_opt); - subflow->fully_established = 1; + WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); if (subflow->is_mptfo) @@ -2062,7 +2062,7 @@ static void subflow_ulp_clone(const struct request_sock *req, } else if (subflow_req->mp_join) { new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; - new_ctx->fully_established = 1; + WRITE_ONCE(new_ctx->fully_established, true); new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->request_bkup = subflow_req->request_bkup; -- cgit v1.3 From 5add80bfdc46f9ad6857c80e3af109177e59a280 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Oct 2024 17:14:05 +0200 Subject: mptcp: implement mptcp_pm_connection_closed The MPTCP path manager event handler mptcp_pm_connection_closed interface has been added in the commit 1b1c7a0ef7f3 ("mptcp: Add path manager interface") but it was an empty function from then on. With such name, it sounds good to invoke mptcp_event with the MPTCP_EVENT_CLOSED event type from it. It also removes a bit of duplicated code. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-3-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +++ net/mptcp/protocol.c | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 620264c75dc2..16c336c51940 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -154,6 +154,9 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) void mptcp_pm_connection_closed(struct mptcp_sock *msk) { pr_debug("msk=%p\n", msk); + + if (msk->token) + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); } void mptcp_pm_subflow_established(struct mptcp_sock *msk) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a6c9661a4c45..e978e05ec8d1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3139,8 +3139,7 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); - if (msk->token) - mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); + mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); @@ -3206,8 +3205,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); - if (msk->token) - mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); + mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow -- cgit v1.3 From 46a3282b87b1f9a88534eba59ecf852b2a21289c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 21 Oct 2024 17:14:06 +0200 Subject: mptcp: use "middlebox interference" RST when no DSS RFC8684 suggests use of "Middlebox interference (code 0x06)" in case of fully established subflow that carries data at TCP level with no DSS sub-option. This is generally the case when mpext is NULL or mpext->use_map is 0: use a dedicated value of 'mapping_status' and use it before closing the socket in subflow_check_data_avail(). Link: https://github.com/multipath-tcp/mptcp_net-next/issues/518 Signed-off-by: Davide Caratti Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241021-net-next-mptcp-misc-6-13-v1-4-1ef02746504a@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 860903e06422..07352b15f145 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -971,7 +971,8 @@ enum mapping_status { MAPPING_EMPTY, MAPPING_DATA_FIN, MAPPING_DUMMY, - MAPPING_BAD_CSUM + MAPPING_BAD_CSUM, + MAPPING_NODSS }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) @@ -1128,8 +1129,9 @@ static enum mapping_status get_mapping_status(struct sock *ssk, return MAPPING_EMPTY; } + /* If the required DSS has likely been dropped by a middlebox */ if (!subflow->map_valid) - return MAPPING_INVALID; + return MAPPING_NODSS; goto validate_seq; } @@ -1343,7 +1345,7 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || - status == MAPPING_BAD_CSUM)) + status == MAPPING_BAD_CSUM || status == MAPPING_NODSS)) goto fallback; if (status != MAPPING_OK) @@ -1396,7 +1398,9 @@ fallback: * subflow_error_report() will introduce the appropriate barriers */ subflow->reset_transient = 0; - subflow->reset_reason = MPTCP_RST_EMPTCP; + subflow->reset_reason = status == MAPPING_NODSS ? + MPTCP_RST_EMIDDLEBOX : + MPTCP_RST_EMPTCP; reset: WRITE_ONCE(ssk->sk_err, EBADMSG); -- cgit v1.3 From ba4e469e42fe1a771b5653d179eb12dc4be6b6a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Oct 2024 13:48:19 +0000 Subject: vsock: do not leave dangling sk pointer in vsock_create() syzbot was able to trigger the following warning after recent core network cleanup. On error vsock_create() frees the allocated sk object, but sock_init_data() has already attached it to the provided sock object. We must clear sock->sk to avoid possible use-after-free later. WARNING: CPU: 0 PID: 5282 at net/socket.c:1581 __sock_create+0x897/0x950 net/socket.c:1581 Modules linked in: CPU: 0 UID: 0 PID: 5282 Comm: syz.2.43 Not tainted 6.12.0-rc2-syzkaller-00667-g53bac8330865 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__sock_create+0x897/0x950 net/socket.c:1581 Code: 7f 06 01 65 48 8b 34 25 00 d8 03 00 48 81 c6 b0 08 00 00 48 c7 c7 60 0b 0d 8d e8 d4 9a 3c 02 e9 11 f8 ff ff e8 0a ab 0d f8 90 <0f> 0b 90 e9 82 fd ff ff 89 e9 80 e1 07 fe c1 38 c1 0f 8c c7 f8 ff RSP: 0018:ffffc9000394fda8 EFLAGS: 00010293 RAX: ffffffff89873c46 RBX: ffff888079f3c818 RCX: ffff8880314b9e00 RDX: 0000000000000000 RSI: 00000000ffffffed RDI: 0000000000000000 RBP: ffffffff8d3337f0 R08: ffffffff8987384e R09: ffffffff8989473a R10: dffffc0000000000 R11: fffffbfff203a276 R12: 00000000ffffffed R13: ffff888079f3c8c0 R14: ffffffff898736e7 R15: dffffc0000000000 FS: 00005555680ab500(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f22b11196d0 CR3: 00000000308c0000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: sock_create net/socket.c:1632 [inline] __sys_socket_create net/socket.c:1669 [inline] __sys_socket+0x150/0x3c0 net/socket.c:1716 __do_sys_socket net/socket.c:1730 [inline] __se_sys_socket net/socket.c:1728 [inline] __x64_sys_socket+0x7a/0x90 net/socket.c:1728 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f22b117dff9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff56aec0e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000029 RAX: ffffffffffffffda RBX: 00007f22b1335f80 RCX: 00007f22b117dff9 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000028 RBP: 00007f22b11f0296 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f22b1335f80 R14: 00007f22b1335f80 R15: 00000000000012dd Fixes: 48156296a08c ("net: warn, if pf->create does not clear sock->sk on error") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20241022134819.1085254-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 35681adedd9a..109b7a0bd071 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -2417,6 +2417,7 @@ static int vsock_create(struct net *net, struct socket *sock, if (sock->type == SOCK_DGRAM) { ret = vsock_assign_transport(vsk, NULL); if (ret < 0) { + sock->sk = NULL; sock_put(sk); return ret; } -- cgit v1.3 From ab101c553bc1f76a839163d1dc0d1e715ad6bb4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Oct 2024 15:00:59 +0000 Subject: neighbour: use kvzalloc()/kvfree() mm layer is providing convenient functions, we do not have to work around old limitations. Signed-off-by: Eric Dumazet Cc: Gilad Naaman Reviewed-by: Joe Damato Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241022150059.1345406-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 395ae1626eef..4b871cecd2ce 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -14,7 +14,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include @@ -538,14 +537,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) { - buckets = kzalloc(size, GFP_ATOMIC); - } else { - buckets = (struct neighbour __rcu **) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); - kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); - } + buckets = kvzalloc(size, GFP_ATOMIC); if (!buckets) { kfree(ret); return NULL; @@ -562,15 +554,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); - size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) { - kfree(buckets); - } else { - kmemleak_free(buckets); - free_pages((unsigned long)buckets, get_order(size)); - } + kvfree(nht->hash_buckets); kfree(nht); } -- cgit v1.3 From 2d34429d14f9d09b38a8bee6a972a07228378df6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:30 -0700 Subject: ipv4: Factorise RTM_NEWADDR validation to inet_validate_rtm(). rtm_to_ifaddr() validates some attributes, looks up a netdev, allocates struct in_ifaddr, and validates IFA_CACHEINFO. There is no reason to delay IFA_CACHEINFO validation. We will push RTNL down to inet_rtm_newaddr(), and then we want to complete rtnetlink validation before rtnl_net_lock(). Let's factorise the validation parts. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 79 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5f859d01cbbe..da5412fb34e7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -846,35 +846,54 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } -static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, - __u32 *pvalid_lft, __u32 *pprefered_lft, - struct netlink_ext_ack *extack) +static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, + struct netlink_ext_ack *extack, + __u32 *valid_lft, __u32 *prefered_lft) { - struct nlattr *tb[IFA_MAX+1]; - struct in_ifaddr *ifa; - struct ifaddrmsg *ifm; - struct net_device *dev; - struct in_device *in_dev; + struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) - goto errout; - - ifm = nlmsg_data(nlh); - err = -EINVAL; + return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); - goto errout; + return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); - goto errout; + return -EINVAL; } + if (tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci; + + ci = nla_data(tb[IFA_CACHEINFO]); + if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { + NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); + return -EINVAL; + } + + *valid_lft = ci->ifa_valid; + *prefered_lft = ci->ifa_prefered; + } + + return 0; +} + +static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifaddrmsg *ifm = nlmsg_data(nlh); + struct in_device *in_dev; + struct net_device *dev; + struct in_ifaddr *ifa; + int err; + dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { @@ -923,23 +942,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); - if (tb[IFA_CACHEINFO]) { - struct ifa_cacheinfo *ci; - - ci = nla_data(tb[IFA_CACHEINFO]); - if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { - NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); - err = -EINVAL; - goto errout_free; - } - *pvalid_lft = ci->ifa_valid; - *pprefered_lft = ci->ifa_prefered; - } - return ifa; -errout_free: - inet_free_ifa(ifa); errout: return ERR_PTR(err); } @@ -964,15 +968,21 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + __u32 prefered_lft = INFINITY_LIFE_TIME; + __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); - struct in_ifaddr *ifa; struct in_ifaddr *ifa_existing; - __u32 valid_lft = INFINITY_LIFE_TIME; - __u32 prefered_lft = INFINITY_LIFE_TIME; + struct nlattr *tb[IFA_MAX + 1]; + struct in_ifaddr *ifa; + int ret; ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack); + ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); + if (ret < 0) + return ret; + + ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -983,8 +993,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { - int ret = ip_mc_autojoin_config(net, true, ifa); - + ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); -- cgit v1.3 From abd0deff03d854cb34818e1e01490296d0314ea1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:31 -0700 Subject: ipv4: Don't allocate ifa for 0.0.0.0 in inet_rtm_newaddr(). When we pass 0.0.0.0 to __inet_insert_ifa(), it frees ifa and returns 0. We can do this check much earlier for RTM_NEWADDR even before allocating struct in_ifaddr. Let's move the validation to 1. inet_insert_ifa() for ioctl() 2. inet_rtm_newaddr() for RTM_NEWADDR Now, we can remove the same check in find_matching_ifa(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index da5412fb34e7..8db84c70ebed 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -508,11 +508,6 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ASSERT_RTNL(); - if (!ifa->ifa_local) { - inet_free_ifa(ifa); - return 0; - } - ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; @@ -584,6 +579,11 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { + if (!ifa->ifa_local) { + inet_free_ifa(ifa); + return 0; + } + return __inet_insert_ifa(ifa, NULL, 0, NULL); } @@ -953,15 +953,13 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; - if (!ifa->ifa_local) - return NULL; - in_dev_for_each_ifa_rtnl(ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } + return NULL; } @@ -982,6 +980,9 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) return ret; + if (!nla_get_in_addr(tb[IFA_LOCAL])) + return 0; + ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); -- cgit v1.3 From 487257786b71172648664164ba567e807e1e11fc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:32 -0700 Subject: ipv4: Convert RTM_NEWADDR to per-netns RTNL. The address hash table and GC are already namespacified. Let's push down RTNL into inet_rtm_newaddr() as rtnl_net_lock(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 8db84c70ebed..7f24bc38981b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -974,8 +974,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in_ifaddr *ifa; int ret; - ASSERT_RTNL(); - ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; @@ -983,9 +981,13 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; + rtnl_net_lock(net); + ifa = inet_rtm_to_ifa(net, nlh, tb, extack); - if (IS_ERR(ifa)) - return PTR_ERR(ifa); + if (IS_ERR(ifa)) { + ret = PTR_ERR(ifa); + goto unlock; + } ifa_existing = find_matching_ifa(ifa); if (!ifa_existing) { @@ -998,11 +1000,11 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); - return ret; + goto unlock; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, - extack); + + ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; @@ -1012,7 +1014,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); - return -EEXIST; + ret = -EEXIST; + goto unlock; } ifa = ifa_existing; @@ -1029,7 +1032,11 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } - return 0; + +unlock: + rtnl_net_unlock(net); + + return ret; } /* @@ -2823,7 +2830,8 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = { }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { - {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr}, + {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, + .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, -- cgit v1.3 From d4b483208b2606add41a22bdd3c8cd6d36009319 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:33 -0700 Subject: ipv4: Use per-netns RTNL helpers in inet_rtm_newaddr(). inet_rtm_to_ifa() and find_matching_ifa() are called under rtnl_net_lock(). __in_dev_get_rtnl() and in_dev_for_each_ifa_rtnl() there can use per-netns RTNL helpers. Let's define and use __in_dev_get_rtnl_net() and in_dev_for_each_ifa_rtnl_net(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/inetdevice.h | 9 +++++++++ net/ipv4/devinet.c | 8 ++++---- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d9c690c8c80b..5730ba6b1cfa 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -226,6 +226,10 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr) for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) +#define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) \ + for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa; \ + ifa = rtnl_net_dereference(net, ifa->ifa_next)) + #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) @@ -252,6 +256,11 @@ static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) return rtnl_dereference(dev->ip_ptr); } +static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev) +{ + return rtnl_net_dereference(dev_net(dev), dev->ip_ptr); +} + /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7f24bc38981b..e14e35c22054 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -901,7 +901,7 @@ static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, goto errout; } - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; @@ -948,12 +948,12 @@ errout: return ERR_PTR(err); } -static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) +static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; - in_dev_for_each_ifa_rtnl(ifa1, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) @@ -989,7 +989,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, goto unlock; } - ifa_existing = find_matching_ifa(ifa); + ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. -- cgit v1.3 From 4df5066f079cfbc563c2da031b02b4ba2d9e1ba0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:34 -0700 Subject: ipv4: Convert RTM_DELADDR to per-netns RTNL. Let's push down RTNL into inet_rtm_deladdr() as rtnl_net_lock(). Now, ip_mc_autojoin_config() is always called under per-netns RTNL, so ASSERT_RTNL() can be replaced with ASSERT_RTNL_NET(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e14e35c22054..6b7780e12f34 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -645,7 +645,7 @@ static int ip_mc_autojoin_config(struct net *net, bool join, struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); lock_sock(sk); if (join) @@ -671,22 +671,24 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in_ifaddr *ifa; int err; - ASSERT_RTNL(); - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) - goto errout; + goto out; ifm = nlmsg_data(nlh); + + rtnl_net_lock(net); + in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; - goto errout; + goto unlock; } - for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -702,13 +704,16 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); + __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); - return 0; + goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; -errout: +unlock: + rtnl_net_unlock(net); +out: return err; } @@ -2832,7 +2837,8 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = { static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, - {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr}, + {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, + .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, -- cgit v1.3 From c350c4761e7f4767dea59aef036ce13276466fd0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:35 -0700 Subject: ipv4: Convert check_lifetime() to per-netns RTNL. Since commit 1675f385213e ("ipv4: Namespacify IPv4 address GC."), check_lifetime() works on a per-netns basis. Let's use rtnl_net_lock() and rtnl_net_dereference(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6b7780e12f34..5eaef3bbb987 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -771,7 +771,8 @@ static void check_lifetime(struct work_struct *work) rcu_read_unlock(); if (!change_needed) continue; - rtnl_lock(); + + rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; @@ -788,7 +789,7 @@ static void check_lifetime(struct work_struct *work) struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; - tmp = rtnl_dereference(*ifap); + tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, @@ -796,7 +797,7 @@ static void check_lifetime(struct work_struct *work) break; } ifap = &tmp->ifa_next; - tmp = rtnl_dereference(*ifap); + tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -806,7 +807,7 @@ static void check_lifetime(struct work_struct *work) rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } - rtnl_unlock(); + rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); -- cgit v1.3 From d1c81818aa227b37d65b40f9883109c5256b9bfb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:36 -0700 Subject: rtnetlink: Define rtnl_net_trylock(). We will need the per-netns version of rtnl_trylock(). rtnl_net_trylock() calls __rtnl_net_lock() only when rtnl_trylock() successfully holds RTNL. When RTNL is removed, we will use mutex_trylock() for per-netns RTNL. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 6 ++++++ net/core/rtnetlink.c | 11 +++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0e62918de63b..14b88f551920 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -101,6 +101,7 @@ void __rtnl_net_lock(struct net *net); void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); +int rtnl_net_trylock(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); @@ -132,6 +133,11 @@ static inline void rtnl_net_unlock(struct net *net) rtnl_unlock(); } +static inline int rtnl_net_trylock(struct net *net) +{ + return rtnl_trylock(); +} + static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 194a81e5f608..dda8230fdfd4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -210,6 +210,17 @@ void rtnl_net_unlock(struct net *net) } EXPORT_SYMBOL(rtnl_net_unlock); +int rtnl_net_trylock(struct net *net) +{ + int ret = rtnl_trylock(); + + if (ret) + __rtnl_net_lock(net); + + return ret; +} +EXPORT_SYMBOL(rtnl_net_trylock); + static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) -- cgit v1.3 From 77453d428d4c9c613341de7f9b943f0c83f37a27 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:37 -0700 Subject: ipv4: Convert devinet_sysctl_forward() to per-netns RTNL. devinet_sysctl_forward() touches only a single netns. Let's use rtnl_trylock() and __in_dev_get_rtnl_net(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5eaef3bbb987..bd65e0ef774e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2395,7 +2395,7 @@ static void inet_forward_change(struct net *net) if (on) dev_disable_lro(dev); - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -2486,7 +2486,7 @@ static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { - if (!rtnl_trylock()) { + if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; @@ -2505,7 +2505,7 @@ static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, idev->dev->ifindex, cnf); } - rtnl_unlock(); + rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, -- cgit v1.3 From 88d1f8770690791cbe5d8f60b17137df05476299 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:38 -0700 Subject: ipv4: Convert devinet_ioctl() to per-netns RTNL except for SIOCSIFFLAGS. Basically, devinet_ioctl() operates on a single netns. However, ioctl(SIOCSIFFLAGS) will trigger the netdev notifier that could touch another netdev in different netns. Let's use per-netns RTNL helper in devinet_ioctl() and place ASSERT_RTNL() for SIOCSIFFLAGS. We will remove ASSERT_RTNL() once RTM_SETLINK and RTM_DELLINK are converted. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index bd65e0ef774e..fb4bc63b8fa2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -589,9 +589,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa) static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); - - ASSERT_RTNL(); + struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); @@ -1129,7 +1127,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) goto out; } - rtnl_lock(); + rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); @@ -1139,7 +1137,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) if (colon) *colon = ':'; - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ @@ -1149,7 +1147,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) This is checked above. */ for (ifap = &in_dev->ifa_list; - (ifa = rtnl_dereference(*ifap)) != NULL; + (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == @@ -1163,7 +1161,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; - (ifa = rtnl_dereference(*ifap)) != NULL; + (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; @@ -1205,6 +1203,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) inet_del_ifa(in_dev, ifap, 1); break; } + + /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ + ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; @@ -1306,7 +1307,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) break; } done: - rtnl_unlock(); + rtnl_net_unlock(net); out: return ret; } -- cgit v1.3 From 7ed8da17bfb2b033e42afa842ca22641821e231c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:39 -0700 Subject: ipv4: Convert devinet_ioctl to per-netns RTNL. ioctl(SIOCGIFCONF) calls dev_ifconf() that operates on the current netns. Let's use per-netns RTNL helpers in dev_ifconf() and inet_gifconf(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/dev_ioctl.c | 6 +++--- net/ipv4/devinet.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 473c437b6b53..46d43b950471 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -64,7 +64,7 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) } /* Loop over the interfaces, and write an info block for each. */ - rtnl_lock(); + rtnl_net_lock(net); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); @@ -72,12 +72,12 @@ int dev_ifconf(struct net *net, struct ifconf __user *uifc) done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { - rtnl_unlock(); + rtnl_net_unlock(net); return -EFAULT; } total += done; } - rtnl_unlock(); + rtnl_net_unlock(net); return put_user(total, &uifc->ifc_len); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fb4bc63b8fa2..f58f39a9ee87 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1314,7 +1314,7 @@ out: int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; @@ -1325,7 +1325,7 @@ int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) if (!in_dev) goto out; - in_dev_for_each_ifa_rtnl(ifa, in_dev) { + in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; -- cgit v1.3 From 1ddf9916ac09313128e40d6581cef889c0b4ce84 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:42 +0200 Subject: xfrm: Add support for per cpu xfrm state handling. Currently all flows for a certain SA must be processed by the same cpu to avoid packet reordering and lock contention of the xfrm state lock. To get rid of this limitation, the IETF standardized per cpu SAs in RFC 9611. This patch implements the xfrm part of it. We add the cpu as a lookup key for xfrm states and a config option to generate acquire messages for each cpu. With that, we can have on each cpu a SA with identical traffic selector so that flows can be processed in parallel on all cpus. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 5 ++-- include/uapi/linux/xfrm.h | 2 ++ net/key/af_key.c | 7 +++--- net/xfrm/xfrm_compat.c | 6 +++-- net/xfrm/xfrm_state.c | 58 ++++++++++++++++++++++++++++++++++++++--------- net/xfrm/xfrm_user.c | 56 +++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 112 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a0bdd58f401c..f5275618e744 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -188,6 +188,7 @@ struct xfrm_state { refcount_t refcnt; spinlock_t lock; + u32 pcpu_num; struct xfrm_id id; struct xfrm_selector sel; struct xfrm_mark mark; @@ -1684,7 +1685,7 @@ struct xfrmk_spdinfo { u32 spdhmcnt; }; -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1796,7 +1797,7 @@ int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, - u8 mode, u32 reqid, u32 if_id, u8 proto, + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index f28701500714..d73a97e3030a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -322,6 +322,7 @@ enum xfrm_attr_type_t { XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ XFRMA_SA_DIR, /* __u8 */ XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ + XFRMA_SA_PCPU, /* __u32 */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ @@ -437,6 +438,7 @@ struct xfrm_userpolicy_info { #define XFRM_POLICY_LOCALOK 1 /* Allow user to override global policy */ /* Automatically expand selector to include matching ICMP payloads. */ #define XFRM_POLICY_ICMP 2 +#define XFRM_POLICY_CPU_ACQUIRE 4 __u8 share; }; diff --git a/net/key/af_key.c b/net/key/af_key.c index f79fb99271ed..c56bb4f451e6 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1354,7 +1354,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (hdr->sadb_msg_seq) { - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; @@ -1362,7 +1362,8 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ } if (!x) - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, + proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; @@ -1417,7 +1418,7 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); if (x == NULL) return 0; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 91357ccaf4af..5b9ee63e30b6 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -132,6 +132,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = { [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, }; static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, @@ -282,9 +283,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_MTIMER_THRESH: case XFRMA_SA_DIR: case XFRMA_NAT_KEEPALIVE_INTERVAL: + case XFRMA_SA_PCPU: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -439,7 +441,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 37478d36a8df..ebef07b80afa 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -679,6 +679,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) x->lft.hard_packet_limit = XFRM_INF; x->replay_maxage = 0; x->replay_maxdiff = 0; + x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); } return x; @@ -1155,6 +1156,12 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, struct xfrm_state **best, int *acq_in_progress, int *error) { + /* We need the cpu id just as a lookup key, + * we don't require it to be stable. + */ + unsigned int pcpu_id = get_cpu(); + put_cpu(); + /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. @@ -1174,13 +1181,18 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, &fl->u.__fl_common)) return; + if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id) + return; + if (!*best || + ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) || (*best)->km.dying > x->km.dying || ((*best)->km.dying == x->km.dying && (*best)->curlft.add_time < x->curlft.add_time)) *best = x; } else if (x->km.state == XFRM_STATE_ACQ) { - *acq_in_progress = 1; + if (!*best || x->pcpu_num == pcpu_id) + *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if ((!x->sel.family || @@ -1209,6 +1221,13 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; + unsigned int pcpu_id; + + /* We need the cpu id just as a lookup key, + * we don't require it to be stable. + */ + pcpu_id = get_cpu(); + put_cpu(); to_put = NULL; @@ -1282,7 +1301,10 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, } found: - x = best; + if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || + (best && (best->pcpu_num == pcpu_id))) + x = best; + if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup_all(net, mark, daddr, @@ -1314,6 +1336,8 @@ found: xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); x->if_id = if_id; + if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best) + x->pcpu_num = pcpu_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { @@ -1392,6 +1416,11 @@ found: x = NULL; error = -ESRCH; } + + /* Use the already installed 'fallback' while the CPU-specific + * SA acquire is handled*/ + if (best) + x = best; } out: if (x) { @@ -1524,12 +1553,14 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; u32 if_id = xnew->if_id; + u32 cpu_id = xnew->pcpu_num; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && + x->pcpu_num == cpu_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) @@ -1552,7 +1583,7 @@ EXPORT_SYMBOL(xfrm_state_insert); static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, - u32 reqid, u32 if_id, u8 proto, + u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) @@ -1569,6 +1600,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || + x->pcpu_num != pcpu_num || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; @@ -1602,6 +1634,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, break; } + x->pcpu_num = pcpu_num; x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; x->props.family = family; @@ -1630,7 +1663,7 @@ static struct xfrm_state *__find_acq_core(struct net *net, return x; } -static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_add(struct xfrm_state *x) { @@ -1656,7 +1689,7 @@ int xfrm_state_add(struct xfrm_state *x) } if (use_spi && x->km.seq) { - x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); + x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num); if (x1 && ((x1->id.proto != x->id.proto) || !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; @@ -1666,7 +1699,7 @@ int xfrm_state_add(struct xfrm_state *x) if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, - x->props.reqid, x->if_id, x->id.proto, + x->props.reqid, x->if_id, x->pcpu_num, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); @@ -1791,6 +1824,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; + x->pcpu_num = orig->pcpu_num; x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; @@ -2066,13 +2100,14 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, - u32 if_id, u8 proto, const xfrm_address_t *daddr, + u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); + x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num, + proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; @@ -2207,7 +2242,7 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, /* Silly enough, but I'm lazy to build resolution list */ -static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; @@ -2215,6 +2250,7 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && + x->pcpu_num == pcpu_num && x->km.state == XFRM_STATE_ACQ) { xfrm_state_hold(x); return x; @@ -2224,12 +2260,12 @@ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 s return NULL; } -struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) +struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_find_acq_byseq(net, mark, seq); + x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e3b8ce89831a..e4d448950d05 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -460,6 +460,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, } } + if (!sa_dir && attrs[XFRMA_SA_PCPU]) { + NL_SET_ERR_MSG(extack, "SA_PCPU only supported with SA_DIR"); + err = -EINVAL; + goto out; + } + out: return err; } @@ -841,6 +847,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, x->nat_keepalive_interval = nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]); + if (attrs[XFRMA_SA_PCPU]) { + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + if (x->pcpu_num >= num_possible_cpus()) + goto error; + } + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; @@ -1296,6 +1308,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } + if (x->pcpu_num != UINT_MAX) { + ret = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (ret) + goto out; + } if (x->dir) ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -1700,6 +1717,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, u32 mark; struct xfrm_mark m; u32 if_id = 0; + u32 pcpu_num = UINT_MAX; p = nlmsg_data(nlh); err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); @@ -1716,8 +1734,16 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_SA_PCPU]) { + pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + if (pcpu_num >= num_possible_cpus()) { + err = -EINVAL; + goto out_noput; + } + } + if (p->info.seq) { - x = xfrm_find_acq_byseq(net, mark, p->info.seq); + x = xfrm_find_acq_byseq(net, mark, p->info.seq, pcpu_num); if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { xfrm_state_put(x); x = NULL; @@ -1726,7 +1752,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, if (!x) x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, - if_id, p->info.id.proto, daddr, + if_id, pcpu_num, p->info.id.proto, daddr, &p->info.saddr, 1, family); err = -ENOENT; @@ -2526,7 +2552,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ + nla_total_size(4) /* XFRM_AE_ETHR */ - + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */ + + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */ + + nla_total_size(4); /* XFRMA_SA_PCPU */ } static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -2582,6 +2609,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) goto out_cancel; + if (x->pcpu_num != UINT_MAX) + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -2852,6 +2881,13 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_mark_get(attrs, &mark); + if (attrs[XFRMA_SA_PCPU]) { + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); + err = -EINVAL; + if (x->pcpu_num >= num_possible_cpus()) + goto free_state; + } + err = verify_newpolicy_info(&ua->policy, extack); if (err) goto free_state; @@ -3182,6 +3218,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3348,7 +3385,8 @@ static inline unsigned int xfrm_expire_msgsize(void) { return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + nla_total_size(sizeof(struct xfrm_mark)) + - nla_total_size(sizeof_field(struct xfrm_state, dir)); + nla_total_size(sizeof_field(struct xfrm_state, dir)) + + nla_total_size(4); /* XFRMA_SA_PCPU */ } static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) @@ -3374,6 +3412,11 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) return err; + if (x->pcpu_num != UINT_MAX) { + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (err) + return err; + } if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); @@ -3481,6 +3524,8 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) } if (x->if_id) l += nla_total_size(sizeof(x->if_id)); + if (x->pcpu_num) + l += nla_total_size(sizeof(x->pcpu_num)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@ -3587,6 +3632,7 @@ static inline unsigned int xfrm_acquire_msgsize(struct xfrm_state *x, + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(xfrm_user_sec_ctx_size(x->security)) + + nla_total_size(4) /* XFRMA_SA_PCPU */ + userpolicy_type_attrsize(); } @@ -3623,6 +3669,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_if_id_put(skb, xp->if_id); if (!err && xp->xdo.dev) err = copy_user_offload(&xp->xdo, skb); + if (!err && x->pcpu_num != UINT_MAX) + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); if (err) { nlmsg_cancel(skb, nlh); return err; -- cgit v1.3 From 0045e3d80613cc7174dc15f189ee6fc4e73b9365 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:43 +0200 Subject: xfrm: Cache used outbound xfrm states at the policy. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we cache the used xfrm states at the policy for outbound IPsec traffic. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/xfrm.h | 4 ++++ net/xfrm/xfrm_policy.c | 12 +++++++++++ net/xfrm/xfrm_state.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f5275618e744..0b394c5fb5f3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -184,6 +184,7 @@ struct xfrm_state { }; struct hlist_node byspi; struct hlist_node byseq; + struct hlist_node state_cache; refcount_t refcnt; spinlock_t lock; @@ -537,6 +538,7 @@ struct xfrm_policy_queue { * @xp_net: network namespace the policy lives in * @bydst: hlist node for SPD hash table or rbtree list * @byidx: hlist node for index hash table + * @state_cache_list: hlist head for policy cached xfrm states * @lock: serialize changes to policy structure members * @refcnt: reference count, freed once it reaches 0 * @pos: kernel internal tie-breaker to determine age of policy @@ -567,6 +569,8 @@ struct xfrm_policy { struct hlist_node bydst; struct hlist_node byidx; + struct hlist_head state_cache_list; + /* This lock only affects elements except for entry. */ rwlock_t lock; refcount_t refcnt; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a2ea9dbac90b..8a1b83191a6c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -434,6 +434,7 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); + INIT_HLIST_HEAD(&policy->state_cache_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); @@ -475,6 +476,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy); static void xfrm_policy_kill(struct xfrm_policy *policy) { + struct net *net = xp_net(policy); + struct xfrm_state *x; + xfrm_dev_policy_delete(policy); write_lock_bh(&policy->lock); @@ -490,6 +494,13 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) if (del_timer(&policy->timer)) xfrm_pol_put(policy); + /* XXX: Flush state cache */ + spin_lock_bh(&net->xfrm.xfrm_state_lock); + hlist_for_each_entry_rcu(x, &policy->state_cache_list, state_cache) { + hlist_del_init_rcu(&x->state_cache); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + xfrm_pol_put(policy); } @@ -3275,6 +3286,7 @@ no_transform: dst_release(dst); dst = dst_orig; } + ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ebef07b80afa..a2047825f6c8 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -665,6 +665,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) refcount_set(&x->refcnt, 1); atomic_set(&x->tunnel_users, 0); INIT_LIST_HEAD(&x->km.all); + INIT_HLIST_NODE(&x->state_cache); INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); @@ -744,12 +745,15 @@ int __xfrm_state_delete(struct xfrm_state *x) if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; + spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); if (x->km.seq) hlist_del_rcu(&x->byseq); + if (!hlist_unhashed(&x->state_cache)) + hlist_del_rcu(&x->state_cache); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1222,6 +1226,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned int sequence; struct km_event c; unsigned int pcpu_id; + bool cached = false; /* We need the cpu id just as a lookup key, * we don't require it to be stable. @@ -1234,6 +1239,46 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_state_addr_check(x, daddr, saddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, encap_family, + &best, &acquire_in_progress, &error); + } + + if (best) + goto cached; + + hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { + if (x->props.family == encap_family && + x->props.reqid == tmpl->reqid && + (mark & x->mark.m) == x->mark.v && + x->if_id == if_id && + !(x->props.flags & XFRM_STATE_WILDRECV) && + xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) + xfrm_state_look_at(pol, x, fl, family, + &best, &acquire_in_progress, &error); + } + +cached: + cached = true; + if (best) + goto found; + else if (error) + best = NULL; + else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ + WARN_ON(1); + h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD @@ -1383,6 +1428,7 @@ found: XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); + INIT_HLIST_NODE(&x->state_cache); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, @@ -1431,6 +1477,15 @@ out: } else { *err = acquire_in_progress ? -EAGAIN : error; } + + if (x && x->km.state == XFRM_STATE_VALID && !cached && + (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || x->pcpu_num == pcpu_id)) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + if (hlist_unhashed(&x->state_cache)) + hlist_add_head_rcu(&x->state_cache, &pol->state_cache_list); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); -- cgit v1.3 From 81a331a0e72ddc2f75092603d9577bd1a0ca23ad Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:44 +0200 Subject: xfrm: Add an inbound percpu state cache. Now that we can have percpu xfrm states, the number of active states might increase. To get a better lookup performance, we add a percpu cache to cache the used inbound xfrm states. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- include/net/netns/xfrm.h | 1 + include/net/xfrm.h | 5 +++++ net/ipv4/esp4_offload.c | 6 ++--- net/ipv6/esp6_offload.c | 6 ++--- net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_state.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 70 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index ae60d6664095..23dd647fe024 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -43,6 +43,7 @@ struct netns_xfrm { struct hlist_head __rcu *state_bysrc; struct hlist_head __rcu *state_byspi; struct hlist_head __rcu *state_byseq; + struct hlist_head __percpu *state_cache_input; unsigned int state_hmask; unsigned int state_num; struct work_struct state_hash_work; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0b394c5fb5f3..2b87999bd5aa 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -185,6 +185,7 @@ struct xfrm_state { struct hlist_node byspi; struct hlist_node byseq; struct hlist_node state_cache; + struct hlist_node state_cache_input; refcount_t refcnt; spinlock_t lock; @@ -1650,6 +1651,10 @@ int xfrm_state_update(struct xfrm_state *x); struct xfrm_state *xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family); +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family); struct xfrm_state *xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 80c4ea0e12f4..e0d94270da28 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -53,9 +53,9 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ip_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 919ebfabbe4e..7b41fb4f00b5 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -80,9 +80,9 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, if (sp->len == XFRM_MAX_DEPTH) goto out_reset; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ipv6_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET6); + x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 749e7eea99e4..841a60a6fbfe 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -572,7 +572,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family); + x = xfrm_input_state_lookup(net, mark, daddr, spi, nexthdr, family); if (x == NULL) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a2047825f6c8..e3266a5d4f90 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -754,6 +754,9 @@ int __xfrm_state_delete(struct xfrm_state *x) hlist_del_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); + if (!hlist_unhashed(&x->state_cache_input)) + hlist_del_rcu(&x->state_cache_input); + if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; @@ -1106,6 +1109,52 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, return NULL; } +struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family) +{ + struct hlist_head *state_cache_input; + struct xfrm_state *x = NULL; + int cpu = get_cpu(); + + state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + + rcu_read_lock(); + hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + !xfrm_addr_equal(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + if (!xfrm_state_hold_rcu(x)) + continue; + goto out; + } + + x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + + if (x && x->km.state == XFRM_STATE_VALID) { + spin_lock_bh(&net->xfrm.xfrm_state_lock); + if (hlist_unhashed(&x->state_cache_input)) { + hlist_add_head_rcu(&x->state_cache_input, state_cache_input); + } else { + hlist_del_rcu(&x->state_cache_input); + hlist_add_head_rcu(&x->state_cache_input, state_cache_input); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + } + +out: + rcu_read_unlock(); + put_cpu(); + return x; +} +EXPORT_SYMBOL(xfrm_input_state_lookup); + static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, @@ -3079,6 +3128,11 @@ int __net_init xfrm_state_init(struct net *net) net->xfrm.state_byseq = xfrm_hash_alloc(sz); if (!net->xfrm.state_byseq) goto out_byseq; + + net->xfrm.state_cache_input = alloc_percpu(struct hlist_head); + if (!net->xfrm.state_cache_input) + goto out_state_cache_input; + net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; @@ -3088,6 +3142,8 @@ int __net_init xfrm_state_init(struct net *net) &net->xfrm.xfrm_state_lock); return 0; +out_state_cache_input: + xfrm_hash_free(net->xfrm.state_byseq, sz); out_byseq: xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: @@ -3117,6 +3173,7 @@ void xfrm_state_fini(struct net *net) xfrm_hash_free(net->xfrm.state_bysrc, sz); WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); + free_percpu(net->xfrm.state_cache_input); } #ifdef CONFIG_AUDITSYSCALL -- cgit v1.3 From 83dfce38c49f3242c7edf5baab5c79c9ec360ecc Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 23 Oct 2024 12:53:45 +0200 Subject: xfrm: Restrict percpu SA attribute to specific netlink message types Reject the usage of XFRMA_SA_PCPU in xfrm netlink messages when it's not applicable. Signed-off-by: Steffen Klassert Tested-by: Antony Antony Tested-by: Tobias Brunner --- net/xfrm/xfrm_user.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e4d448950d05..b6ce2b3c6b87 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3282,6 +3282,20 @@ static int xfrm_reject_unused_attr(int type, struct nlattr **attrs, } } + if (attrs[XFRMA_SA_PCPU]) { + switch (type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_UPDSA: + case XFRM_MSG_ALLOCSPI: + case XFRM_MSG_ACQUIRE: + + break; + default: + NL_SET_ERR_MSG(extack, "Invalid attribute SA_PCPU"); + return -EINVAL; + } + } + return 0; } -- cgit v1.3 From b76ebf22c578375e69b35061b5d47149efd957f9 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 22 Oct 2024 11:48:00 +0200 Subject: ipv4: Prepare fib_compute_spec_dst() to future .flowi4_tos conversion. Use ip4h_dscp() to get the DSCP from the IPv4 header, then convert the dscp_t value to __u8 with inet_dscp_to_dsfield(). Then, when we'll convert .flowi4_tos to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/a0eba69cce94f747e4c7516184a85ffd0abbe3f0.1729530028.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 53bd26315df5..0c9ce934b490 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -293,7 +293,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK, + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; -- cgit v1.3 From 0ed373390c5c180d19a40f258c2e72754f641eb9 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 22 Oct 2024 11:48:08 +0200 Subject: ipv4: Prepare icmp_reply() to future .flowi4_tos conversion. Use ip4h_dscp() to get the DSCP from the IPv4 header, then convert the dscp_t value to __u8 with inet_dscp_to_dsfield(). Then, when we'll convert .flowi4_tos to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/61b7563563f8b0a562b5b62032fe5260034d0aac.1729530028.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 23664434922e..33eec844a5a0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -445,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); -- cgit v1.3 From 6ab04392dd087d896910dc618b4a14e54c58a499 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 22 Oct 2024 11:48:15 +0200 Subject: ipv4: Prepare ipmr_rt_fib_lookup() to future .flowi4_tos conversion. Use ip4h_dscp() to get the DSCP from the IPv4 header, then convert the dscp_t value to __u8 with inet_dscp_to_dsfield(). Then, when we'll convert .flowi4_tos to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/462402a097260357a7aba80228612305f230b6a9.1729530028.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b4fc443481ce..99e7cd0531d9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2081,7 +2081,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = iph->tos & INET_DSCP_MASK, + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? -- cgit v1.3 From 85ef52e8693c4d3d23f33dc8007ebf11d5d4d4ce Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 22 Oct 2024 11:48:23 +0200 Subject: ipv4: Prepare ip_rt_get_source() to future .flowi4_tos conversion. Use ip4h_dscp() to get the DSCP from the IPv4 header, then convert the dscp_t value to __u8 with inet_dscp_to_dsfield(). Then, when we'll convert .flowi4_tos to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/0a13a200f31809841975e38633914af1061e0c04.1729530028.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 18a08b4f4a5a..763398e08b7d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1263,7 +1263,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = iph->tos & INET_DSCP_MASK, + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, -- cgit v1.3 From bdd85ddce5a9fb786daecbc7ed73bf8cdee06856 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 22 Oct 2024 14:03:20 -0700 Subject: rtnetlink: Fix kdoc of rtnl_af_register(). Commit 26eebdc4b005 ("rtnetlink: Return int from rtnl_af_register().") made rtnl_af_register() return int again, and kdoc needs to be fixed up. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241022210320.86111-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dda8230fdfd4..b70f90b98714 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -708,7 +708,7 @@ static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * - * Returns 0 on success or a negative error code. + * Return: 0 on success or a negative error code. */ int rtnl_af_register(struct rtnl_af_ops *ops) { -- cgit v1.3 From 4bbd360a5084d8f890f814327e1d9fbb1f0f6fa1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 24 Oct 2024 13:14:58 -0700 Subject: socket: Print pf->create() when it does not clear sock->sk on failure. I suggested to put DEBUG_NET_WARN_ON_ONCE() in __sock_create() to catch possible use-after-free. But the warning itself was not useful because our interest is in the callee than the caller. Let's define DEBUG_NET_WARN_ONCE() and print the name of pf->create() and the socket identifier. While at it, we enclose DEBUG_NET_WARN_ON_ONCE() in parentheses too to avoid a checkpatch error. Note that %pf or %pF were obsoleted and will be removed later as per comment in lib/vsprintf.c. Link: https://lore.kernel.org/netdev/202410231427.633734b3-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241024201458.49412-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_debug.h | 4 +++- net/socket.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/net_debug.h b/include/net/net_debug.h index 1e74684cbbdb..9fecb1496be3 100644 --- a/include/net/net_debug.h +++ b/include/net/net_debug.h @@ -149,9 +149,11 @@ do { \ #if defined(CONFIG_DEBUG_NET) -#define DEBUG_NET_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define DEBUG_NET_WARN_ON_ONCE(cond) ((void)WARN_ON_ONCE(cond)) +#define DEBUG_NET_WARN_ONCE(cond, format...) ((void)WARN_ONCE(cond, format)) #else #define DEBUG_NET_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define DEBUG_NET_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #endif /* _LINUX_NET_DEBUG_H */ diff --git a/net/socket.c b/net/socket.c index 9a8e4452b9b2..5fb3d265e492 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1578,7 +1578,9 @@ int __sock_create(struct net *net, int family, int type, int protocol, /* ->create should release the allocated sock->sk object on error * and make sure sock->sk is set to NULL to avoid use-after-free */ - DEBUG_NET_WARN_ON_ONCE(sock->sk); + DEBUG_NET_WARN_ONCE(sock->sk, + "%ps must clear sock->sk on failure, family: %d, type: %d, protocol: %d\n", + pf->create, family, type, protocol); goto out_module_put; } -- cgit v1.3 From da3ee3cd79ca900ae435777bd3193080197c2aca Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:01 +0200 Subject: devlink: introduce devlink_nl_put_u64() Add devlink_nl_put_u64() that abstracts padding for u64 values. All u64 values are passed with the very same padding option. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-2-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- net/devlink/devl_internal.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index a9f064ab9ed9..14eaad9cfe35 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -181,6 +181,11 @@ devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +static inline int devlink_nl_put_u64(struct sk_buff *msg, int attrtype, u64 val) +{ + return nla_put_u64_64bit(msg, attrtype, val, DEVLINK_ATTR_PAD); +} + int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net, struct devlink *devlink, int attrtype); int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info); -- cgit v1.3 From a788acf154eb62a29bed75886d6e626744379cf4 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:02 +0200 Subject: devlink: use devlink_nl_put_u64() helper Use devlink_nl_put_u64() shortcut added by prev commit on all devlink/. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-3-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- net/devlink/dev.c | 12 ++++++------ net/devlink/dpipe.c | 18 ++++++++---------- net/devlink/health.c | 25 +++++++++++-------------- net/devlink/rate.c | 8 ++++---- net/devlink/region.c | 11 ++++------- net/devlink/resource.c | 27 ++++++++++++--------------- net/devlink/trap.c | 34 ++++++++++++++-------------------- 7 files changed, 59 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 13c73f50da3d..9264bbc90d0c 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -971,14 +971,14 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, params->component)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, - params->done, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE, + params->done)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, - params->total, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, + params->total)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, - params->timeout, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + params->timeout)) goto nla_put_failure; out: diff --git a/net/devlink/dpipe.c b/net/devlink/dpipe.c index 55009b377447..e55701b007f0 100644 --- a/net/devlink/dpipe.c +++ b/net/devlink/dpipe.c @@ -165,18 +165,17 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size, - DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size)) goto nla_put_failure; if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, table->counters_enabled)) goto nla_put_failure; if (table->resource_valid) { - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, - table->resource_id, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, - table->resource_units, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id) || + devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units)) goto nla_put_failure; } if (devlink_dpipe_matches_put(table, skb)) @@ -403,12 +402,11 @@ static int devlink_dpipe_entry_put(struct sk_buff *skb, if (!entry_attr) return -EMSGSIZE; - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, - DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (entry->counter_valid) - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, - entry->counter, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, + entry->counter)) goto nla_put_failure; matches_attr = nla_nest_start_noflag(skb, diff --git a/net/devlink/health.c b/net/devlink/health.c index acb8c0e174bb..b8d3084e6fe0 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -287,29 +287,27 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, reporter->health_state)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, - reporter->error_count, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, + reporter->error_count)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, - reporter->recovery_count, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, + reporter->recovery_count)) goto reporter_nest_cancel; if (reporter->ops->recover && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period)) goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) goto reporter_nest_cancel; if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts))) goto reporter_nest_cancel; if (reporter->dump_fmsg && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, - reporter->dump_real_ts, DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, + reporter->dump_real_ts)) goto reporter_nest_cancel; if (reporter->ops->dump && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, @@ -963,8 +961,7 @@ devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) case NLA_U32: return nla_put_u32(skb, attrtype, *(u32 *)msg->value); case NLA_U64: - return nla_put_u64_64bit(skb, attrtype, *(u64 *)msg->value, - DEVLINK_ATTR_PAD); + return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value); case NLA_NUL_STRING: return nla_put_string(skb, attrtype, (char *)&msg->value); case NLA_BINARY: diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 7139e67e93ae..8828ffaf6cbc 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -108,12 +108,12 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, goto nla_put_failure; } - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_SHARE, - devlink_rate->tx_share, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE, + devlink_rate->tx_share)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_RATE_TX_MAX, - devlink_rate->tx_max, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX, + devlink_rate->tx_max)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, diff --git a/net/devlink/region.c b/net/devlink/region.c index 7319127c5913..0a75a2fbd4d7 100644 --- a/net/devlink/region.c +++ b/net/devlink/region.c @@ -145,9 +145,7 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, - region->size, - DEVLINK_ATTR_PAD); + err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, region->size); if (err) goto nla_put_failure; @@ -210,8 +208,8 @@ devlink_nl_region_notify_build(struct devlink_region *region, if (err) goto out_cancel_msg; } else { - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE, - region->size, DEVLINK_ATTR_PAD); + err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE, + region->size); if (err) goto out_cancel_msg; } @@ -773,8 +771,7 @@ static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, if (err) goto nla_put_failure; - err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr, - DEVLINK_ATTR_PAD); + err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr); if (err) goto nla_put_failure; diff --git a/net/devlink/resource.c b/net/devlink/resource.c index 594c8aeb3bfa..5ce05e94f484 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -141,12 +141,12 @@ devlink_resource_size_params_put(struct devlink_resource *resource, struct devlink_resource_size_params *size_params; size_params = &resource->size_params; - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, - size_params->size_granularity, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, - size_params->size_max, DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, - size_params->size_min, DEVLINK_ATTR_PAD) || + if (devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity) || + devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max) || + devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min) || nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit)) return -EMSGSIZE; return 0; @@ -157,9 +157,8 @@ static int devlink_resource_occ_put(struct devlink_resource *resource, { if (!resource->occ_get) return 0; - return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, - resource->occ_get(resource->occ_get_priv), - DEVLINK_ATTR_PAD); + return devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->occ_get(resource->occ_get_priv)); } static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, @@ -174,14 +173,12 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, return -EMSGSIZE; if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, - DEVLINK_ATTR_PAD) || - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, - DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size) || + devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id)) goto nla_put_failure; if (resource->size != resource->size_new && - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, - resource->size_new, DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new)) goto nla_put_failure; if (devlink_resource_occ_put(resource, skb)) goto nla_put_failure; diff --git a/net/devlink/trap.c b/net/devlink/trap.c index 5d18c7424df1..f36087f90db5 100644 --- a/net/devlink/trap.c +++ b/net/devlink/trap.c @@ -189,14 +189,12 @@ devlink_trap_group_stats_put(struct sk_buff *msg, if (!attr) return -EMSGSIZE; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - u64_stats_read(&stats.rx_packets), - DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + u64_stats_read(&stats.rx_packets))) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - u64_stats_read(&stats.rx_bytes), - DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, + u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); @@ -231,18 +229,15 @@ static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink, return -EMSGSIZE; if (devlink->ops->trap_drop_counter_get && - nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, - DEVLINK_ATTR_PAD)) + devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_PACKETS, - u64_stats_read(&stats.rx_packets), - DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS, + u64_stats_read(&stats.rx_packets))) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_BYTES, - u64_stats_read(&stats.rx_bytes), - DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES, + u64_stats_read(&stats.rx_bytes))) goto nla_put_failure; nla_nest_end(msg, attr); @@ -750,8 +745,7 @@ devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink, if (!attr) return -EMSGSIZE; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops, - DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops)) goto nla_put_failure; nla_nest_end(msg, attr); @@ -783,12 +777,12 @@ devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink, policer_item->policer->id)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, - policer_item->rate, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_RATE, + policer_item->rate)) goto nla_put_failure; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, - policer_item->burst, DEVLINK_ATTR_PAD)) + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_BURST, + policer_item->burst)) goto nla_put_failure; err = devlink_trap_policer_stats_put(msg, devlink, -- cgit v1.3 From e0b140c44f322230c2bb97a7e8ac773419f7e81a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:03 +0200 Subject: devlink: devl_resource_register(): differentiate error codes Differentiate error codes of devl_resource_register(). Replace one of -EINVAL exit paths by -EEXIST. This should aid developers introducing new resources and registering them in the wrong order. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-4-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- net/devlink/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/devlink/resource.c b/net/devlink/resource.c index 5ce05e94f484..96c0ff24b65a 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -345,7 +345,7 @@ int devl_resource_register(struct devlink *devlink, resource = devlink_resource_find(devlink, NULL, resource_id); if (resource) - return -EINVAL; + return -EEXIST; resource = kzalloc(sizeof(*resource), GFP_KERNEL); if (!resource) -- cgit v1.3 From 72429e9e0cfb1bd7480a968d741edf787c134f06 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:04 +0200 Subject: devlink: region: snapshot IDs: consolidate error values Consolidate error codes for too big message size. Current code is written to return -EINVAL when tailroom in the skb msg would be exhausted precisely when it's time to nest, and return -EMSGSIZE in all other "not enough space" conditions. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-5-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- net/devlink/region.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/devlink/region.c b/net/devlink/region.c index 0a75a2fbd4d7..63fb297f6d67 100644 --- a/net/devlink/region.c +++ b/net/devlink/region.c @@ -77,7 +77,7 @@ static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, snap_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT); if (!snap_attr) - return -EINVAL; + return -EMSGSIZE; err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id); if (err) @@ -102,7 +102,7 @@ static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg, snapshots_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOTS); if (!snapshots_attr) - return -EINVAL; + return -EMSGSIZE; list_for_each_entry(snapshot, ®ion->snapshot_list, list) { err = devlink_nl_region_snapshot_id_put(msg, devlink, snapshot); -- cgit v1.3 From d5020cb41e3c19196fe6f180950867ab7510d398 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:05 +0200 Subject: net: dsa: replace devlink resource registration calls by devl_ variants Replace devlink_resource_register(), devlink_resource_occ_get_register(), and devlink_resource_occ_get_unregister() calls by respective devl_* variants. Mentioned functions have no direct users in any drivers, and are going to be removed in subsequent patches. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-6-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- net/dsa/devlink.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c index 0aac887d0098..f41f9fc2194e 100644 --- a/net/dsa/devlink.c +++ b/net/dsa/devlink.c @@ -229,10 +229,15 @@ int dsa_devlink_resource_register(struct dsa_switch *ds, u64 parent_resource_id, const struct devlink_resource_size_params *size_params) { - return devlink_resource_register(ds->devlink, resource_name, - resource_size, resource_id, - parent_resource_id, - size_params); + int ret; + + devl_lock(ds->devlink); + ret = devl_resource_register(ds->devlink, resource_name, resource_size, + resource_id, parent_resource_id, + size_params); + devl_unlock(ds->devlink); + + return ret; } EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); @@ -247,15 +252,19 @@ void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, devlink_resource_occ_get_t *occ_get, void *occ_get_priv) { - return devlink_resource_occ_get_register(ds->devlink, resource_id, - occ_get, occ_get_priv); + devl_lock(ds->devlink); + devl_resource_occ_get_register(ds->devlink, resource_id, occ_get, + occ_get_priv); + devl_unlock(ds->devlink); } EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id) { - devlink_resource_occ_get_unregister(ds->devlink, resource_id); + devl_lock(ds->devlink); + devl_resource_occ_get_unregister(ds->devlink, resource_id); + devl_unlock(ds->devlink); } EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); -- cgit v1.3 From 2a0df10434ddeb8b5b5aded90a5659aff3b106d7 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:06 +0200 Subject: devlink: remove unused devlink_resource_occ_get_register() and _unregister() Remove not used devlink_resource_occ_get_register() and devlink_resource_occ_get_unregister() functions; current devlink resource users are fine with devl_ variants of the two. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-7-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 7 ------- net/devlink/resource.c | 39 --------------------------------------- 2 files changed, 46 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index db5eff6cb60f..fdd6a0f9891d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1797,15 +1797,8 @@ void devl_resource_occ_get_register(struct devlink *devlink, u64 resource_id, devlink_resource_occ_get_t *occ_get, void *occ_get_priv); -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv); void devl_resource_occ_get_unregister(struct devlink *devlink, u64 resource_id); - -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id); int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count); diff --git a/net/devlink/resource.c b/net/devlink/resource.c index 96c0ff24b65a..a923222bbde8 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -513,28 +513,6 @@ void devl_resource_occ_get_register(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_resource_occ_get_register); -/** - * devlink_resource_occ_get_register - register occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * @occ_get: occupancy getter callback - * @occ_get_priv: occupancy getter callback priv - * - * Context: Takes and release devlink->lock . - */ -void devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - devl_lock(devlink); - devl_resource_occ_get_register(devlink, resource_id, - occ_get, occ_get_priv); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); - /** * devl_resource_occ_get_unregister - unregister occupancy getter * @@ -557,20 +535,3 @@ void devl_resource_occ_get_unregister(struct devlink *devlink, resource->occ_get_priv = NULL; } EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister); - -/** - * devlink_resource_occ_get_unregister - unregister occupancy getter - * - * @devlink: devlink - * @resource_id: resource id - * - * Context: Takes and release devlink->lock . - */ -void devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ - devl_lock(devlink); - devl_resource_occ_get_unregister(devlink, resource_id); - devl_unlock(devlink); -} -EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); -- cgit v1.3 From e3302f9a503a632c125170dc3c72b2886a910b0a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Wed, 23 Oct 2024 15:09:07 +0200 Subject: devlink: remove unused devlink_resource_register() Remove unused devlink_resource_register(); all the drivers use devl_resource_register() variant instead. Reviewed-by: Wojciech Drewek Reviewed-by: Jiri Pirko Reviewed-by: Joe Damato Signed-off-by: Przemek Kitszel Link: https://patch.msgid.link/20241023131248.27192-8-przemyslaw.kitszel@intel.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ------ net/devlink/resource.c | 33 --------------------------------- 2 files changed, 39 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index fdd6a0f9891d..fbb9a2668e24 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1779,12 +1779,6 @@ int devl_resource_register(struct devlink *devlink, u64 resource_id, u64 parent_resource_id, const struct devlink_resource_size_params *size_params); -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params); void devl_resources_unregister(struct devlink *devlink); void devlink_resources_unregister(struct devlink *devlink); int devl_resource_size_get(struct devlink *devlink, diff --git a/net/devlink/resource.c b/net/devlink/resource.c index a923222bbde8..2d6324f3d91f 100644 --- a/net/devlink/resource.c +++ b/net/devlink/resource.c @@ -381,39 +381,6 @@ int devl_resource_register(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devl_resource_register); -/** - * devlink_resource_register - devlink resource register - * - * @devlink: devlink - * @resource_name: resource's name - * @resource_size: resource's size - * @resource_id: resource's id - * @parent_resource_id: resource's parent id - * @size_params: size parameters - * - * Generic resources should reuse the same names across drivers. - * Please see the generic resources list at: - * Documentation/networking/devlink/devlink-resource.rst - * - * Context: Takes and release devlink->lock . - */ -int devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - int err; - - devl_lock(devlink); - err = devl_resource_register(devlink, resource_name, resource_size, - resource_id, parent_resource_id, size_params); - devl_unlock(devlink); - return err; -} -EXPORT_SYMBOL_GPL(devlink_resource_register); - static void devlink_resource_unregister(struct devlink *devlink, struct devlink_resource *resource) { -- cgit v1.3 From 668d663989c77fcb2a92748645e4c394b03d5988 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Wed, 23 Oct 2024 16:14:52 +0800 Subject: tcp: add more warn of socket in tcp_send_loss_probe() Add two fields to print in the helper which here covers tcp_send_loss_probe(). Link: https://lore.kernel.org/all/5632e043-bdba-4d75-bc7e-bf58014492fd@redhat.com/ Suggested-by: Paolo Abeni Signed-off-by: Jason Xing Cc: Neal Cardwell Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp_output.c | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b8d94bb1746..e9b37b76e894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2433,8 +2433,9 @@ void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); static inline void tcp_warn_once(const struct sock *sk, bool cond, const char *str) { WARN_ONCE(cond, - "%sout:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", + "%scwn:%u out:%u sacked:%u lost:%u retrans:%u tlp_high_seq:%u sk_state:%u ca_state:%u advmss:%u mss_cache:%u pmtu:%u\n", str, + tcp_snd_cwnd(tcp_sk(sk)), tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, tcp_sk(sk)->tlp_high_seq, sk->sk_state, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 054244ce5117..5485a70b5fe5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2954,9 +2954,7 @@ void tcp_send_loss_probe(struct sock *sk) } skb = skb_rb_last(&sk->tcp_rtx_queue); if (unlikely(!skb)) { - WARN_ONCE(tp->packets_out, - "invalid inflight: %u state %u cwnd %u mss %d\n", - tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); + tcp_warn_once(sk, tp->packets_out, "invalid inflight: "); smp_store_release(&inet_csk(sk)->icsk_pending, 0); return; } -- cgit v1.3 From 6a4794d5a3e2bf10233ce8a5e53f168e23715e8a Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Sat, 26 Oct 2024 12:53:37 +0000 Subject: bpf: bpf_csum_diff: Optimize and homogenize for all archs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Optimization ------------ The current implementation copies the 'from' and 'to' buffers to a scratchpad and it takes the bitwise NOT of 'from' buffer while copying. In the next step csum_partial() is called with this scratchpad. so, mathematically, the current implementation is doing: result = csum(to - from) Here, 'to' and '~ from' are copied in to the scratchpad buffer, we need it in the scratchpad buffer because csum_partial() takes a single contiguous buffer and not two disjoint buffers like 'to' and 'from'. We can re write this equation to: result = csum(to) - csum(from) using the distributive property of csum(). this allows 'to' and 'from' to be at different locations and therefore this scratchpad and copying is not needed. This in C code will look like: result = csum_sub(csum_partial(to, to_size, seed), csum_partial(from, from_size, 0)); 2. Homogenization -------------- The bpf_csum_diff() helper calls csum_partial() which is implemented by some architectures like arm and x86 but other architectures rely on the generic implementation in lib/checksum.c The generic implementation in lib/checksum.c returns a 16 bit value but the arch specific implementations can return more than 16 bits, this works out in most places because before the result is used, it is passed through csum_fold() that turns it into a 16-bit value. bpf_csum_diff() directly returns the value from csum_partial() and therefore the returned values could be different on different architectures. see discussion in [1]: for the int value 28 the calculated checksums are: x86 : -29 : 0xffffffe3 generic (arm64, riscv) : 65507 : 0x0000ffe3 arm : 131042 : 0x0001ffe2 Pass the result of bpf_csum_diff() through from32to16() before returning to homogenize this result for all architectures. NOTE: from32to16() is used instead of csum_fold() because csum_fold() does from32to16() + bitwise NOT of the result, which is not what we want to do here. [1] https://lore.kernel.org/bpf/CAJ+HfNiQbOcqCLxFUP2FMm5QrLXUUaj852Fxe3hn_2JNiucn6g@mail.gmail.com/ Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20241026125339.26459-3-puranjay@kernel.org --- net/core/filter.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a88e6924c4c0..f215d151f77d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,18 +1654,6 @@ void sk_reuseport_prog_free(struct bpf_prog *prog) bpf_prog_destroy(prog); } -struct bpf_scratchpad { - union { - __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; - u8 buff[MAX_BPF_STACK]; - }; - local_lock_t bh_lock; -}; - -static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp) = { - .bh_lock = INIT_LOCAL_LOCK(bh_lock), -}; - static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { @@ -2022,11 +2010,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { - struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u32 diff_size = from_size + to_size; - int i, j = 0; - __wsum ret; - /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data @@ -2035,19 +2018,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, * * Even for diffing, from_size and to_size don't need to be equal. */ - if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || - diff_size > sizeof(sp->diff))) - return -EINVAL; - local_lock_nested_bh(&bpf_sp.bh_lock); - for (i = 0; i < from_size / sizeof(__be32); i++, j++) - sp->diff[j] = ~from[i]; - for (i = 0; i < to_size / sizeof(__be32); i++, j++) - sp->diff[j] = to[i]; + __wsum ret = seed; - ret = csum_partial(sp->diff, diff_size, seed); - local_unlock_nested_bh(&bpf_sp.bh_lock); - return ret; + if (from_size && to_size) + ret = csum_sub(csum_partial(to, to_size, ret), + csum_partial(from, from_size, 0)); + else if (to_size) + ret = csum_partial(to, to_size, ret); + + else if (from_size) + ret = ~csum_partial(from, from_size, ~ret); + + return csum_from32to16((__force unsigned int)ret); } static const struct bpf_func_proto bpf_csum_diff_proto = { -- cgit v1.3 From b35108a51cf7bab58d7eace1267d7965978bcdb8 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 30 Oct 2024 17:47:35 +0000 Subject: jiffies: Define secs_to_jiffies() secs_to_jiffies() is defined in hci_event.c and cannot be reused by other call sites. Hoist it into the core code to allow conversion of the ~1150 usages of msecs_to_jiffies() that either: - use a multiplier value of 1000 or equivalently MSEC_PER_SEC, or - have timeouts that are denominated in seconds (i.e. end in 000) It's implemented as a macro to allow usage in static initializers. This will also allow conversion of yet more sites that use (sec * HZ) directly, and improve their readability. Suggested-by: Michael Kelley Signed-off-by: Easwar Hariharan Signed-off-by: Thomas Gleixner Reviewed-by: Luiz Augusto von Dentz Link: https://lore.kernel.org/all/20241030-open-coded-timeouts-v3-1-9ba123facf88@linux.microsoft.com --- include/linux/jiffies.h | 13 +++++++++++++ net/bluetooth/hci_event.c | 2 -- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 5d21dacd62bc..ed945f42e064 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -526,6 +526,19 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) } } +/** + * secs_to_jiffies: - convert seconds to jiffies + * @_secs: time in seconds + * + * Conversion is done by simple multiplication with HZ + * + * secs_to_jiffies() is defined as a macro rather than a static inline + * function so it can be used in static initializers. + * + * Return: jiffies value + */ +#define secs_to_jiffies(_secs) ((_secs) * HZ) + extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1c82dcdf6e8f..4bd94d432bcf 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -42,8 +42,6 @@ #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" -#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000) - /* Handle HCI Event packets */ static void *hci_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, -- cgit v1.3 From a0af7162ccb501a22ac448ad94dad81757743725 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Oct 2024 16:52:47 +0300 Subject: net: dsa: clean up dsa_user_add_cls_matchall() The body is a bit hard to read, hard to extend, and has duplicated conditions. Clean up the "if (many conditions) else if (many conditions, some of them repeated)" pattern by: - Moving the repeated conditions out - Replacing the repeated tests for the same variable with a switch/case - Moving the protocol check inside the dsa_user_add_cls_matchall_mirred() function call. This is pure refactoring, no logic has been changed, though some tests were reordered. The order does not matter - they are independent things to be tested for. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241023135251.1752488-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dsa/user.c b/net/dsa/user.c index 91a1fa5f8ab0..15f69fa6a38b 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1376,6 +1376,9 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct dsa_port *to_dp; int err; + if (cls->common.protocol != htons(ETH_P_ALL)) + return -EOPNOTSUPP; + if (!ds->ops->port_mirror_add) return -EOPNOTSUPP; @@ -1485,17 +1488,21 @@ static int dsa_user_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { - int err = -EOPNOTSUPP; + const struct flow_action *action = &cls->rule->action; - if (cls->common.protocol == htons(ETH_P_ALL) && - flow_offload_has_one_action(&cls->rule->action) && - cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED) - err = dsa_user_add_cls_matchall_mirred(dev, cls, ingress); - else if (flow_offload_has_one_action(&cls->rule->action) && - cls->rule->action.entries[0].id == FLOW_ACTION_POLICE) - err = dsa_user_add_cls_matchall_police(dev, cls, ingress); + if (!flow_offload_has_one_action(action)) + return -EOPNOTSUPP; - return err; + switch (action->entries[0].id) { + case FLOW_ACTION_MIRRED: + return dsa_user_add_cls_matchall_mirred(dev, cls, ingress); + case FLOW_ACTION_POLICE: + return dsa_user_add_cls_matchall_police(dev, cls, ingress); + default: + break; + } + + return -EOPNOTSUPP; } static void dsa_user_del_cls_matchall(struct net_device *dev, -- cgit v1.3 From c11ace14d9db3a2e2e7b473ff8f79c7b1c998191 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Oct 2024 16:52:48 +0300 Subject: net: dsa: use "extack" as argument to flow_action_basic_hw_stats_check() We already have an "extack" stack variable in dsa_user_add_cls_matchall_police() and dsa_user_add_cls_matchall_mirred(), there is no need to retrieve it again from cls->common.extack. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241023135251.1752488-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/user.c b/net/dsa/user.c index 15f69fa6a38b..a89425a8de2e 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1382,8 +1382,7 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, if (!ds->ops->port_mirror_add) return -EOPNOTSUPP; - if (!flow_action_basic_hw_stats_check(&cls->rule->action, - cls->common.extack)) + if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; @@ -1449,8 +1448,7 @@ dsa_user_add_cls_matchall_police(struct net_device *dev, return -EOPNOTSUPP; } - if (!flow_action_basic_hw_stats_check(&cls->rule->action, - cls->common.extack)) + if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { -- cgit v1.3 From 4cc4394a897eae27405396d1d756a9f75a3addc3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Oct 2024 16:52:49 +0300 Subject: net: dsa: add more extack messages in dsa_user_add_cls_matchall_mirred() Do not leave -EOPNOTSUPP errors without an explanation. It is confusing for the user to figure out what is wrong otherwise. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241023135251.1752488-5-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/user.c b/net/dsa/user.c index a89425a8de2e..398418cd0b78 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1376,11 +1376,17 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct dsa_port *to_dp; int err; - if (cls->common.protocol != htons(ETH_P_ALL)) + if (cls->common.protocol != htons(ETH_P_ALL)) { + NL_SET_ERR_MSG_MOD(extack, + "Can only offload \"protocol all\" matchall filter"); return -EOPNOTSUPP; + } - if (!ds->ops->port_mirror_add) + if (!ds->ops->port_mirror_add) { + NL_SET_ERR_MSG_MOD(extack, + "Switch does not support mirroring operation"); return -EOPNOTSUPP; + } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; @@ -1487,9 +1493,13 @@ static int dsa_user_add_cls_matchall(struct net_device *dev, bool ingress) { const struct flow_action *action = &cls->rule->action; + struct netlink_ext_ack *extack = cls->common.extack; - if (!flow_offload_has_one_action(action)) + if (!flow_offload_has_one_action(action)) { + NL_SET_ERR_MSG_MOD(extack, + "Cannot offload matchall filter with more than one action"); return -EOPNOTSUPP; + } switch (action->entries[0].id) { case FLOW_ACTION_MIRRED: @@ -1497,6 +1507,7 @@ static int dsa_user_add_cls_matchall(struct net_device *dev, case FLOW_ACTION_POLICE: return dsa_user_add_cls_matchall_police(dev, cls, ingress); default: + NL_SET_ERR_MSG_MOD(extack, "Unknown action"); break; } -- cgit v1.3 From 3535d70df9c80b382a202baa430aa8fbb2433bfa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Oct 2024 16:52:50 +0300 Subject: net: dsa: allow matchall mirroring rules towards the CPU If the CPU bandwidth capacity permits, it may be useful to mirror the entire ingress of a user port to software. This is in fact possible to express even if there is no net_device representation for the CPU port. In fact, that approach was already exhausted and that representation wouldn't have even helped [1]. The idea behind implementing this is that currently, we refuse to offload any mirroring towards a non-DSA target net_device. But if we acknowledge the fact that to reach any foreign net_device, the switch must send the packet to the CPU anyway, then we can simply offload just that part, and let the software do the rest. There is only one condition we need to uphold: the filter needs to be present in the software data path as well (no skip_sw). There are 2 actions to consider: FLOW_ACTION_MIRRED (redirect to egress of target interface) and FLOW_ACTION_MIRRED_INGRESS (redirect to ingress of target interface). We don't have the ability/API to offload FLOW_ACTION_MIRRED_INGRESS when the target port is also a DSA user port, but we could also permit that through mirred to the CPU + software. Example: $ ip link add dummy0 type dummy; ip link set dummy0 up $ tc qdisc add dev swp0 clsact $ tc filter add dev swp0 ingress matchall action mirred ingress mirror dev dummy0 Any DSA driver with a ds->ops->port_mirror_add() implementation can now make use of this with no additional change. [1] https://lore.kernel.org/netdev/20191002233750.13566-1-olteanv@gmail.com/ Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241023135251.1752488-6-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/user.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dsa/user.c b/net/dsa/user.c index 398418cd0b78..b18ad0105b01 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1364,7 +1364,7 @@ dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) static int dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, - bool ingress) + bool ingress, bool ingress_target) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); @@ -1396,10 +1396,30 @@ dsa_user_add_cls_matchall_mirred(struct net_device *dev, if (!act->dev) return -EINVAL; - if (!dsa_user_dev_check(act->dev)) - return -EOPNOTSUPP; - - to_dp = dsa_user_to_port(act->dev); + if (dsa_user_dev_check(act->dev)) { + if (ingress_target) { + /* We can only fulfill this using software assist */ + if (cls->common.skip_sw) { + NL_SET_ERR_MSG_MOD(extack, + "Can only mirred to ingress of DSA user port if filter also runs in software"); + return -EOPNOTSUPP; + } + to_dp = dp->cpu_dp; + } else { + to_dp = dsa_user_to_port(act->dev); + } + } else { + /* Handle mirroring to foreign target ports as a mirror towards + * the CPU. The software tc rule will take the packets from + * there. + */ + if (cls->common.skip_sw) { + NL_SET_ERR_MSG_MOD(extack, + "Can only mirred to CPU if filter also runs in software"); + return -EOPNOTSUPP; + } + to_dp = dp->cpu_dp; + } if (dp->ds != to_dp->ds) { NL_SET_ERR_MSG_MOD(extack, @@ -1503,7 +1523,11 @@ static int dsa_user_add_cls_matchall(struct net_device *dev, switch (action->entries[0].id) { case FLOW_ACTION_MIRRED: - return dsa_user_add_cls_matchall_mirred(dev, cls, ingress); + return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, + false); + case FLOW_ACTION_MIRRED_INGRESS: + return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, + true); case FLOW_ACTION_POLICE: return dsa_user_add_cls_matchall_police(dev, cls, ingress); default: -- cgit v1.3 From bd03e7627c377c69508b56ab63d1339bf45a4552 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 26 Oct 2024 16:17:44 +0200 Subject: rtnetlink: Fix an error handling path in rtnl_newlink() When some code has been moved in the commit in Fixes, some "return err;" have correctly been changed in goto but this one was missed. Should "ops->maxtype > RTNL_MAX_TYPE" happen, then some resources would leak. Go through the error handling path to fix these leaks. Fixes: 0d3008d1a9ae ("rtnetlink: Move ops->validate to rtnl_newlink().") Signed-off-by: Christophe JAILLET Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/eca90eeb4d9e9a0545772b68aeaab883d9fe2279.1729952228.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b70f90b98714..7b96f89333d3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3840,8 +3840,10 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; + if (ops->maxtype > RTNL_MAX_TYPE) { + ret = -EINVAL; + goto put_ops; + } if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, -- cgit v1.3 From cf44bd08cdeeeceb61a439f9dc437ded23adb75d Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Fri, 25 Oct 2024 16:45:44 +0800 Subject: tcp: only release congestion control if it has been initialized Currently, when cleaning up congestion control, we always call the release regardless of whether it has been initialized. There is no need to release when closing TCP_LISTEN and TCP_CLOSE (close immediately after socket()). In this case, tcp_cdg calls kfree(NULL) in release without causing an exception, but for some customized ca, this could lead to unexpected exceptions. We need to ensure that init and release are called in pairs. Signed-off-by: Pengcheng Yang Link: https://patch.msgid.link/1729845944-6003-1-git-send-email-yangpc@wangsu.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 82cc4a5633ce..0d704bda6c41 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3336,7 +3336,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; - if (icsk->icsk_ca_ops->release) + if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0306d257fa64..df758adbb445 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -270,8 +270,9 @@ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->release) + if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); + icsk->icsk_ca_initialized = 0; bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } -- cgit v1.3 From bc74d329ceba23f998ead4f716266da5afe319f7 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 30 Oct 2024 09:21:47 +0800 Subject: netlink: Remove the dead code in netlink_proto_init() In the error path of netlink_proto_init(), frees the already allocated bucket table for new hash tables in a loop, but it is going to panic, so it is not necessary to clean up the resources, just remove the dead code. Suggested-by: Kuniyuki Iwashima Signed-off-by: Jinjie Ruan Link: https://patch.msgid.link/20241030012147.357400-1-ruanjinjie@huawei.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0a9287fadb47..52a7c7233cab 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2935,12 +2935,8 @@ static int __init netlink_proto_init(void) for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, - &netlink_rhashtable_params) < 0) { - while (--i > 0) - rhashtable_destroy(&nl_table[i].hash); - kfree(nl_table); + &netlink_rhashtable_params) < 0) goto panic; - } } netlink_add_usersock_entry(); -- cgit v1.3 From 53c0a58beb60b76e105a61aae518fd780eec03d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 May 2024 23:32:20 -0400 Subject: net/socket.c: switch to CLASS(fd) The important part in sockfd_lookup_light() is avoiding needless file refcount operations, not the marginal reduction of the register pressure from not keeping a struct file pointer in the caller. Switch to use fdget()/fdpu(); with sane use of CLASS(fd) we can get a better code generation... Would be nice if somebody tested it on networking test suites (including benchmarks)... sockfd_lookup_light() does fdget(), uses sock_from_file() to get the associated socket and returns the struct socket reference to the caller, along with "do we need to fput()" flag. No matching fdput(), the caller does its equivalent manually, using the fact that sock->file points to the struct file the socket has come from. Get rid of that - have the callers do fdget()/fdput() and use sock_from_file() directly. That kills sockfd_lookup_light() and fput_light() (no users left). What's more, we can get rid of explicit fdget()/fdput() by switching to CLASS(fd, ...) - code generation does not suffer, since now fdput() inserted on "descriptor is not opened" failure exit is recognized to be a no-op by compiler. [folded a fix for braino in do_recvmmsg() caught by Simon Horman] Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/file.h | 6 - net/socket.c | 303 +++++++++++++++++++++++---------------------------- 2 files changed, 137 insertions(+), 172 deletions(-) (limited to 'net') diff --git a/include/linux/file.h b/include/linux/file.h index f98de143245a..b49a92295b3f 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -30,12 +30,6 @@ extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount extern struct file *alloc_file_clone(struct file *, int flags, const struct file_operations *); -static inline void fput_light(struct file *file, int fput_needed) -{ - if (fput_needed) - fput(file); -} - /* either a reference to struct file + flags * (cloned vs. borrowed, pos locked), with * flags stored in lower bits of value, diff --git a/net/socket.c b/net/socket.c index 601ad74930ef..c3ac02d060c0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -509,7 +509,7 @@ static int sock_map_fd(struct socket *sock, int flags) struct socket *sock_from_file(struct file *file) { - if (file->f_op == &socket_file_ops) + if (likely(file->f_op == &socket_file_ops)) return file->private_data; /* set in sock_alloc_file */ return NULL; @@ -549,24 +549,6 @@ struct socket *sockfd_lookup(int fd, int *err) } EXPORT_SYMBOL(sockfd_lookup); -static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) -{ - struct fd f = fdget(fd); - struct socket *sock; - - *err = -EBADF; - if (fd_file(f)) { - sock = sock_from_file(fd_file(f)); - if (likely(sock)) { - *fput_needed = f.word & FDPUT_FPUT; - return sock; - } - *err = -ENOTSOCK; - fdput(f); - } - return NULL; -} - static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { @@ -1853,16 +1835,20 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) { struct socket *sock; struct sockaddr_storage address; - int err, fput_needed; - - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock) { - err = move_addr_to_kernel(umyaddr, addrlen, &address); - if (!err) - err = __sys_bind_socket(sock, &address, addrlen); - fput_light(sock->file, fput_needed); - } - return err; + CLASS(fd, f)(fd); + int err; + + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; + + err = move_addr_to_kernel(umyaddr, addrlen, &address); + if (unlikely(err)) + return err; + + return __sys_bind_socket(sock, &address, addrlen); } SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen) @@ -1891,15 +1877,16 @@ int __sys_listen_socket(struct socket *sock, int backlog) int __sys_listen(int fd, int backlog) { + CLASS(fd, f)(fd); struct socket *sock; - int err, fput_needed; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock) { - err = __sys_listen_socket(sock, backlog); - fput_light(sock->file, fput_needed); - } - return err; + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; + + return __sys_listen_socket(sock, backlog); } SYSCALL_DEFINE2(listen, int, fd, int, backlog) @@ -2009,17 +1996,12 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen, int flags) { - int ret = -EBADF; - struct fd f; + CLASS(fd, f)(fd); - f = fdget(fd); - if (fd_file(f)) { - ret = __sys_accept4_file(fd_file(f), upeer_sockaddr, + if (fd_empty(f)) + return -EBADF; + return __sys_accept4_file(fd_file(f), upeer_sockaddr, upeer_addrlen, flags); - fdput(f); - } - - return ret; } SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, @@ -2071,20 +2053,18 @@ out: int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) { - int ret = -EBADF; - struct fd f; + struct sockaddr_storage address; + CLASS(fd, f)(fd); + int ret; - f = fdget(fd); - if (fd_file(f)) { - struct sockaddr_storage address; + if (fd_empty(f)) + return -EBADF; - ret = move_addr_to_kernel(uservaddr, addrlen, &address); - if (!ret) - ret = __sys_connect_file(fd_file(f), &address, addrlen, 0); - fdput(f); - } + ret = move_addr_to_kernel(uservaddr, addrlen, &address); + if (ret) + return ret; - return ret; + return __sys_connect_file(fd_file(f), &address, addrlen, 0); } SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, @@ -2103,26 +2083,25 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, { struct socket *sock; struct sockaddr_storage address; - int err, fput_needed; + CLASS(fd, f)(fd); + int err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - goto out; + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; err = security_socket_getsockname(sock); if (err) - goto out_put; + return err; err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); if (err < 0) - goto out_put; - /* "err" is actually length in this case */ - err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); + return err; -out_put: - fput_light(sock->file, fput_needed); -out: - return err; + /* "err" is actually length in this case */ + return move_addr_to_user(&address, err, usockaddr, usockaddr_len); } SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, @@ -2141,26 +2120,25 @@ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, { struct socket *sock; struct sockaddr_storage address; - int err, fput_needed; + CLASS(fd, f)(fd); + int err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock != NULL) { - const struct proto_ops *ops = READ_ONCE(sock->ops); + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; - err = security_socket_getpeername(sock); - if (err) { - fput_light(sock->file, fput_needed); - return err; - } + err = security_socket_getpeername(sock); + if (err) + return err; - err = ops->getname(sock, (struct sockaddr *)&address, 1); - if (err >= 0) - /* "err" is actually length in this case */ - err = move_addr_to_user(&address, err, usockaddr, - usockaddr_len); - fput_light(sock->file, fput_needed); - } - return err; + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1); + if (err < 0) + return err; + + /* "err" is actually length in this case */ + return move_addr_to_user(&address, err, usockaddr, usockaddr_len); } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, @@ -2181,14 +2159,17 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, struct sockaddr_storage address; int err; struct msghdr msg; - int fput_needed; err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter); if (unlikely(err)) return err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - goto out; + + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; msg.msg_name = NULL; msg.msg_control = NULL; @@ -2198,7 +2179,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, if (addr) { err = move_addr_to_kernel(addr, addr_len, &address); if (err < 0) - goto out_put; + return err; msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } @@ -2206,12 +2187,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = __sock_sendmsg(sock, &msg); - -out_put: - fput_light(sock->file, fput_needed); -out: - return err; + return __sock_sendmsg(sock, &msg); } SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, @@ -2246,14 +2222,18 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, }; struct socket *sock; int err, err2; - int fput_needed; err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter); if (unlikely(err)) return err; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - goto out; + + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; @@ -2265,9 +2245,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, if (err2 < 0) err = err2; } - - fput_light(sock->file, fput_needed); -out: return err; } @@ -2342,17 +2319,16 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval, { sockptr_t optval = USER_SOCKPTR(user_optval); bool compat = in_compat_syscall(); - int err, fput_needed; struct socket *sock; + CLASS(fd, f)(fd); - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - return err; - - err = do_sock_setsockopt(sock, compat, level, optname, optval, optlen); + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; - fput_light(sock->file, fput_needed); - return err; + return do_sock_setsockopt(sock, compat, level, optname, optval, optlen); } SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, @@ -2408,20 +2384,17 @@ EXPORT_SYMBOL(do_sock_getsockopt); int __sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) { - int err, fput_needed; struct socket *sock; - bool compat; + CLASS(fd, f)(fd); - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - return err; + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; - compat = in_compat_syscall(); - err = do_sock_getsockopt(sock, compat, level, optname, + return do_sock_getsockopt(sock, in_compat_syscall(), level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); - - fput_light(sock->file, fput_needed); - return err; } SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, @@ -2447,15 +2420,16 @@ int __sys_shutdown_sock(struct socket *sock, int how) int __sys_shutdown(int fd, int how) { - int err, fput_needed; struct socket *sock; + CLASS(fd, f)(fd); - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (sock != NULL) { - err = __sys_shutdown_sock(sock, how); - fput_light(sock->file, fput_needed); - } - return err; + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; + + return __sys_shutdown_sock(sock, how); } SYSCALL_DEFINE2(shutdown, int, fd, int, how) @@ -2671,22 +2645,21 @@ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { - int fput_needed, err; struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - goto out; + CLASS(fd, f)(fd); - err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; - fput_light(sock->file, fput_needed); -out: - return err; + return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); } SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags) @@ -2701,7 +2674,7 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, bool forbid_cmsg_compat) { - int fput_needed, err, datagrams; + int err, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; @@ -2717,9 +2690,13 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, datagrams = 0; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - return err; + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; used_address.name_len = UINT_MAX; entry = mmsg; @@ -2756,8 +2733,6 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, cond_resched(); } - fput_light(sock->file, fput_needed); - /* We only return an error if no datagrams were able to be sent */ if (datagrams != 0) return datagrams; @@ -2879,22 +2854,21 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags, bool forbid_cmsg_compat) { - int fput_needed, err; struct msghdr msg_sys; struct socket *sock; if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT)) return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - goto out; + CLASS(fd, f)(fd); - err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; - fput_light(sock->file, fput_needed); -out: - return err; + return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); } SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, @@ -2911,7 +2885,7 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec64 *timeout) { - int fput_needed, err, datagrams; + int err = 0, datagrams; struct socket *sock; struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; @@ -2926,16 +2900,18 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, datagrams = 0; - sock = sockfd_lookup_light(fd, &err, &fput_needed); - if (!sock) - return err; + CLASS(fd, f)(fd); + + if (fd_empty(f)) + return -EBADF; + sock = sock_from_file(fd_file(f)); + if (unlikely(!sock)) + return -ENOTSOCK; if (likely(!(flags & MSG_ERRQUEUE))) { err = sock_error(sock->sk); - if (err) { - datagrams = err; - goto out_put; - } + if (err) + return err; } entry = mmsg; @@ -2992,12 +2968,10 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, } if (err == 0) - goto out_put; + return datagrams; - if (datagrams == 0) { - datagrams = err; - goto out_put; - } + if (datagrams == 0) + return err; /* * We may return less entries than requested (vlen) if the @@ -3012,9 +2986,6 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, */ WRITE_ONCE(sock->sk->sk_err, -err); } -out_put: - fput_light(sock->file, fput_needed); - return datagrams; } -- cgit v1.3 From f302edb9d822804e72df3fa6ba270234050c678b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jul 2024 21:49:04 -0400 Subject: switch netlink_getsockbyfilp() to taking descriptor the only call site (in do_mq_notify()) obtains the argument from an immediately preceding fdget() and it is immediately followed by fdput(); might as well just replace it with a variant that would take a descriptor instead of struct file * and have file lookups handled inside that function. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/netlink.h | 2 +- ipc/mqueue.c | 8 +------- net/netlink/af_netlink.c | 9 +++++++-- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index b332c2048c75..a48a30842d84 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -239,7 +239,7 @@ int netlink_register_notifier(struct notifier_block *nb); int netlink_unregister_notifier(struct notifier_block *nb); /* finegrained unicast helpers: */ -struct sock *netlink_getsockbyfilp(struct file *filp); +struct sock *netlink_getsockbyfd(int fd); int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk); void netlink_detachskb(struct sock *sk, struct sk_buff *skb); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 34fa0bd8bb11..fd05e3d4f7b6 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1355,13 +1355,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) skb_put(nc, NOTIFY_COOKIE_LEN); /* and attach it to the socket */ retry: - f = fdget(notification->sigev_signo); - if (!fd_file(f)) { - ret = -EBADF; - goto out; - } - sock = netlink_getsockbyfilp(fd_file(f)); - fdput(f); + sock = netlink_getsockbyfd(notification->sigev_signo); if (IS_ERR(sock)) { ret = PTR_ERR(sock); goto free_skb; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0b7a89db3ab7..42451ac355d0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1180,11 +1180,16 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) return sock; } -struct sock *netlink_getsockbyfilp(struct file *filp) +struct sock *netlink_getsockbyfd(int fd) { - struct inode *inode = file_inode(filp); + CLASS(fd, f)(fd); + struct inode *inode; struct sock *sock; + if (fd_empty(f)) + return ERR_PTR(-EBADF); + + inode = file_inode(fd_file(f)); if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); -- cgit v1.3 From 6348be02eead77bdd1562154ed6b3296ad3b3750 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jul 2024 20:17:58 -0400 Subject: fdget(), trivial conversions fdget() is the first thing done in scope, all matching fdput() are immediately followed by leaving the scope. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- arch/powerpc/kvm/book3s_64_vio.c | 21 ++++------------ arch/powerpc/kvm/powerpc.c | 24 ++++++------------ arch/powerpc/platforms/cell/spu_syscalls.c | 6 ++--- arch/x86/kernel/cpu/sgx/main.c | 10 +++----- arch/x86/kvm/svm/sev.c | 39 ++++++++++-------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c | 23 ++++++------------ drivers/gpu/drm/drm_syncobj.c | 9 +++---- drivers/media/rc/lirc_dev.c | 13 +++------- fs/btrfs/ioctl.c | 5 ++-- fs/eventfd.c | 9 +++---- fs/eventpoll.c | 23 ++++++------------ fs/fhandle.c | 5 ++-- fs/ioctl.c | 23 ++++++------------ fs/kernel_read_file.c | 12 +++------ fs/notify/fanotify/fanotify_user.c | 15 ++++-------- fs/notify/inotify/inotify_user.c | 17 ++++--------- fs/open.c | 36 ++++++++++++--------------- fs/read_write.c | 28 +++++++-------------- fs/signalfd.c | 9 +++---- fs/sync.c | 29 +++++++++------------- io_uring/sqpoll.c | 29 ++++++---------------- kernel/events/core.c | 14 ++++------- kernel/nsproxy.c | 5 ++-- kernel/pid.c | 7 ++---- kernel/sys.c | 15 ++++-------- kernel/watch_queue.c | 6 ++--- mm/fadvise.c | 10 +++----- mm/readahead.c | 17 ++++--------- net/core/net_namespace.c | 10 +++----- security/landlock/syscalls.c | 26 ++++++-------------- virt/kvm/vfio.c | 8 ++---- 31 files changed, 164 insertions(+), 339 deletions(-) (limited to 'net') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 34c0adb9fdbf..742aa58a7c7e 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -115,10 +115,9 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, struct iommu_table_group *table_group; long i; struct kvmppc_spapr_tce_iommu_table *stit; - struct fd f; + CLASS(fd, f)(tablefd); - f = fdget(tablefd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; rcu_read_lock(); @@ -130,16 +129,12 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, } rcu_read_unlock(); - if (!found) { - fdput(f); + if (!found) return -EINVAL; - } table_group = iommu_group_get_iommudata(grp); - if (WARN_ON(!table_group)) { - fdput(f); + if (WARN_ON(!table_group)) return -EFAULT; - } for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { struct iommu_table *tbltmp = table_group->tables[i]; @@ -160,10 +155,8 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, break; } } - if (!tbl) { - fdput(f); + if (!tbl) return -EINVAL; - } rcu_read_lock(); list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { @@ -174,7 +167,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, /* stit is being destroyed */ iommu_tce_table_put(tbl); rcu_read_unlock(); - fdput(f); return -ENOTTY; } /* @@ -182,7 +174,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, * its KVM reference counter and can return. */ rcu_read_unlock(); - fdput(f); return 0; } rcu_read_unlock(); @@ -190,7 +181,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, stit = kzalloc(sizeof(*stit), GFP_KERNEL); if (!stit) { iommu_tce_table_put(tbl); - fdput(f); return -ENOMEM; } @@ -199,7 +189,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, list_add_rcu(&stit->next, &stt->iommu_tables); - fdput(f); return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index f14329989e9a..b3b37ea77849 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1933,12 +1933,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, #endif #ifdef CONFIG_KVM_MPIC case KVM_CAP_IRQ_MPIC: { - struct fd f; + CLASS(fd, f)(cap->args[0]); struct kvm_device *dev; r = -EBADF; - f = fdget(cap->args[0]); - if (!fd_file(f)) + if (fd_empty(f)) break; r = -EPERM; @@ -1946,18 +1945,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, if (dev) r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); - fdput(f); break; } #endif #ifdef CONFIG_KVM_XICS case KVM_CAP_IRQ_XICS: { - struct fd f; + CLASS(fd, f)(cap->args[0]); struct kvm_device *dev; r = -EBADF; - f = fdget(cap->args[0]); - if (!fd_file(f)) + if (fd_empty(f)) break; r = -EPERM; @@ -1968,34 +1965,27 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, else r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); } - - fdput(f); break; } #endif /* CONFIG_KVM_XICS */ #ifdef CONFIG_KVM_XIVE case KVM_CAP_PPC_IRQ_XIVE: { - struct fd f; + CLASS(fd, f)(cap->args[0]); struct kvm_device *dev; r = -EBADF; - f = fdget(cap->args[0]); - if (!fd_file(f)) + if (fd_empty(f)) break; r = -ENXIO; - if (!xive_enabled()) { - fdput(f); + if (!xive_enabled()) break; - } r = -EPERM; dev = kvm_device_from_filp(fd_file(f)); if (dev) r = kvmppc_xive_native_connect_vcpu(dev, vcpu, cap->args[1]); - - fdput(f); break; } #endif /* CONFIG_KVM_XIVE */ diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c index cd7d42fc12a6..da4fad7fc8bf 100644 --- a/arch/powerpc/platforms/cell/spu_syscalls.c +++ b/arch/powerpc/platforms/cell/spu_syscalls.c @@ -64,12 +64,10 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags, return -ENOSYS; if (flags & SPU_CREATE_AFFINITY_SPU) { - struct fd neighbor = fdget(neighbor_fd); + CLASS(fd, neighbor)(neighbor_fd); ret = -EBADF; - if (fd_file(neighbor)) { + if (!fd_empty(neighbor)) ret = calls->create_thread(name, flags, mode, fd_file(neighbor)); - fdput(neighbor); - } } else ret = calls->create_thread(name, flags, mode, NULL); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 9ace84486499..eb5848d1851a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -901,19 +901,15 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct fd f = fdget(attribute_fd); + CLASS(fd, f)(attribute_fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EINVAL; - if (fd_file(f)->f_op != &sgx_provision_fops) { - fdput(f); + if (fd_file(f)->f_op != &sgx_provision_fops) return -EINVAL; - } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - - fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0b851ef937f2..34304f6c36be 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -530,17 +530,12 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) static int __sev_issue_cmd(int fd, int id, void *data, int *error) { - struct fd f; - int ret; + CLASS(fd, f)(fd); - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = sev_issue_cmd_external_user(fd_file(f), id, data, error); - - fdput(f); - return ret; + return sev_issue_cmd_external_user(fd_file(f), id, data, error); } static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) @@ -2073,23 +2068,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(fd_file(f))) { - ret = -EBADF; - goto out_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto out_fput; + return ret; if (kvm->arch.vm_type != source_kvm->arch.vm_type || sev_guest(kvm) || !sev_guest(source_kvm)) { @@ -2136,8 +2129,6 @@ out_dst_cgroup: cg_cleanup_sev->misc_cg = NULL; out_unlock: sev_unlock_two_vms(kvm, source_kvm); -out_fput: - fdput(f); return ret; } @@ -2798,23 +2789,21 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(fd_file(f))) { - ret = -EBADF; - goto e_source_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto e_source_fput; + return ret; /* * Mirrors of mirrors should work, but let's not get silly. Also @@ -2857,8 +2846,6 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); -e_source_fput: - fdput(f); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c index b0a8abc7a8ec..341beec59537 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c @@ -35,21 +35,19 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev, int fd, int32_t priority) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct amdgpu_fpriv *fpriv; struct amdgpu_ctx_mgr *mgr; struct amdgpu_ctx *ctx; uint32_t id; int r; - if (!fd_file(f)) + if (fd_empty(f)) return -EINVAL; r = amdgpu_file_to_fpriv(fd_file(f), &fpriv); - if (r) { - fdput(f); + if (r) return r; - } mgr = &fpriv->ctx_mgr; mutex_lock(&mgr->lock); @@ -57,7 +55,6 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev, amdgpu_ctx_priority_override(ctx, priority); mutex_unlock(&mgr->lock); - fdput(f); return 0; } @@ -66,31 +63,25 @@ static int amdgpu_sched_context_priority_override(struct amdgpu_device *adev, unsigned ctx_id, int32_t priority) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct amdgpu_fpriv *fpriv; struct amdgpu_ctx *ctx; int r; - if (!fd_file(f)) + if (fd_empty(f)) return -EINVAL; r = amdgpu_file_to_fpriv(fd_file(f), &fpriv); - if (r) { - fdput(f); + if (r) return r; - } ctx = amdgpu_ctx_get(fpriv, ctx_id); - if (!ctx) { - fdput(f); + if (!ctx) return -EINVAL; - } amdgpu_ctx_priority_override(ctx, priority); amdgpu_ctx_put(ctx); - fdput(f); - return 0; } diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 8e3d2d7060f8..4f2ab8a7b50f 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -712,16 +712,14 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private, int fd, u32 *handle) { struct drm_syncobj *syncobj; - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EINVAL; - if (fd_file(f)->f_op != &drm_syncobj_file_fops) { - fdput(f); + if (fd_file(f)->f_op != &drm_syncobj_file_fops) return -EINVAL; - } /* take a reference to put in the idr */ syncobj = fd_file(f)->private_data; @@ -739,7 +737,6 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private, } else drm_syncobj_put(syncobj); - fdput(f); return ret; } diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c index f042f3f14afa..a2257dc2f25d 100644 --- a/drivers/media/rc/lirc_dev.c +++ b/drivers/media/rc/lirc_dev.c @@ -815,28 +815,23 @@ void __exit lirc_dev_exit(void) struct rc_dev *rc_dev_get_from_fd(int fd, bool write) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct lirc_fh *fh; struct rc_dev *dev; - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); - if (fd_file(f)->f_op != &lirc_fops) { - fdput(f); + if (fd_file(f)->f_op != &lirc_fops) return ERR_PTR(-EINVAL); - } - if (write && !(fd_file(f)->f_mode & FMODE_WRITE)) { - fdput(f); + if (write && !(fd_file(f)->f_mode & FMODE_WRITE)) return ERR_PTR(-EPERM); - } fh = fd_file(f)->private_data; dev = fh->rc; get_device(&dev->dev); - fdput(f); return dev; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 226c91fe31a7..adb591b1d071 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1308,9 +1308,9 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { - struct fd src = fdget(fd); + CLASS(fd, src)(fd); struct inode *src_inode; - if (!fd_file(src)) { + if (fd_empty(src)) { ret = -EINVAL; goto out_drop_write; } @@ -1341,7 +1341,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, BTRFS_I(src_inode)->root, readonly, inherit); } - fdput(src); } out_drop_write: mnt_drop_write_file(file); diff --git a/fs/eventfd.c b/fs/eventfd.c index 22c934f3a080..76129bfcd663 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -347,13 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget); */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { - struct eventfd_ctx *ctx; - struct fd f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return ERR_PTR(-EBADF); - ctx = eventfd_ctx_fileget(fd_file(f)); - fdput(f); - return ctx; + return eventfd_ctx_fileget(fd_file(f)); } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1ae4542f0bd8..4607dcbc2851 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2254,25 +2254,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, { int error; int full_check = 0; - struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; - error = -EBADF; - f = fdget(epfd); - if (!fd_file(f)) - goto error_return; + CLASS(fd, f)(epfd); + if (fd_empty(f)) + return -EBADF; /* Get the "struct file *" for the target file */ - tf = fdget(fd); - if (!fd_file(tf)) - goto error_fput; + CLASS(fd, tf)(fd); + if (fd_empty(tf)) + return -EBADF; /* The target file descriptor must support poll */ - error = -EPERM; if (!file_can_poll(fd_file(tf))) - goto error_tgt_fput; + return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) @@ -2391,12 +2388,6 @@ error_tgt_fput: loop_check_gen++; mutex_unlock(&epnested_mutex); } - - fdput(tf); -error_fput: - fdput(f); -error_return: - return error; } diff --git a/fs/fhandle.c b/fs/fhandle.c index 82df28d45cd7..5f801139358e 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -139,12 +139,11 @@ static int get_path_from_fd(int fd, struct path *root) path_get(root); spin_unlock(&fs->lock); } else { - struct fd f = fdget(fd); - if (!fd_file(f)) + CLASS(fd, f)(fd); + if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); - fdput(f); } return 0; diff --git a/fs/ioctl.c b/fs/ioctl.c index 6e0c954388d4..638a36be31c1 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -231,11 +231,11 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { - struct fd src_file = fdget(srcfd); + CLASS(fd, src_file)(srcfd); loff_t cloned; int ret; - if (!fd_file(src_file)) + if (fd_empty(src_file)) return -EBADF; cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff, olen, 0); @@ -245,7 +245,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, ret = -EINVAL; else ret = 0; - fdput(src_file); return ret; } @@ -892,22 +891,20 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int error; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = security_file_ioctl(fd_file(f), cmd, arg); if (error) - goto out; + return error; error = do_vfs_ioctl(fd_file(f), fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(fd_file(f), cmd, arg); -out: - fdput(f); return error; } @@ -950,15 +947,15 @@ EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int error; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; error = security_file_ioctl_compat(fd_file(f), cmd, arg); if (error) - goto out; + return error; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ @@ -1009,10 +1006,6 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, error = -ENOTTY; break; } - - out: - fdput(f); - return error; } #endif diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 9ff37ae650ea..de32c95d823d 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -175,15 +175,11 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf, size_t buf_size, size_t *file_size, enum kernel_read_file_id id) { - struct fd f = fdget(fd); - ssize_t ret = -EBADF; + CLASS(fd, f)(fd); - if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ)) - goto out; + if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + return -EBADF; - ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); -out: - fdput(f); - return ret; + return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id); } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9644bc72e457..07c5ffc8523b 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1003,22 +1003,17 @@ static int fanotify_find_path(int dfd, const char __user *filename, dfd, filename, flags); if (filename == NULL) { - struct fd f = fdget(dfd); + CLASS(fd, f)(dfd); - ret = -EBADF; - if (!fd_file(f)) - goto out; + if (fd_empty(f)) + return -EBADF; - ret = -ENOTDIR; if ((flags & FAN_MARK_ONLYDIR) && - !(S_ISDIR(file_inode(fd_file(f))->i_mode))) { - fdput(f); - goto out; - } + !(S_ISDIR(file_inode(fd_file(f))->i_mode))) + return -ENOTDIR; *path = fd_file(f)->f_path; path_get(path); - fdput(f); } else { unsigned int lookup_flags = 0; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 0794dcaf1e47..dc645af2a6ad 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -794,33 +794,26 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; - struct fd f; - int ret = -EINVAL; + CLASS(fd, f)(fd); - f = fdget(fd); - if (unlikely(!fd_file(f))) + if (fd_empty(f)) return -EBADF; /* verify that this is indeed an inotify instance */ if (unlikely(fd_file(f)->f_op != &inotify_fops)) - goto out; + return -EINVAL; group = fd_file(f)->private_data; i_mark = inotify_idr_find(group, wd); if (unlikely(!i_mark)) - goto out; - - ret = 0; + return -EINVAL; fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); - -out: - fdput(f); - return ret; + return 0; } /* diff --git a/fs/open.c b/fs/open.c index a0c1fa3f60d5..24d22f4222f0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -349,14 +349,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate); int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - error = vfs_fallocate(fd_file(f), mode, offset, len); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fallocate(fd_file(f), mode, offset, len); } SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) @@ -666,14 +664,12 @@ int vfs_fchmod(struct file *file, umode_t mode) SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) { - struct fd f = fdget(fd); - int err = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - err = vfs_fchmod(fd_file(f), mode); - fdput(f); - } - return err; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchmod(fd_file(f), mode); } static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, @@ -860,14 +856,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group) int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { - struct fd f = fdget(fd); - int error = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - error = vfs_fchown(fd_file(f), user, group); - fdput(f); - } - return error; + if (fd_empty(f)) + return -EBADF; + + return vfs_fchown(fd_file(f), user, group); } SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) diff --git a/fs/read_write.c b/fs/read_write.c index ef3ee3725714..5e3df2d39283 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1663,36 +1663,32 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, { loff_t pos_in; loff_t pos_out; - struct fd f_in; - struct fd f_out; ssize_t ret = -EBADF; - f_in = fdget(fd_in); - if (!fd_file(f_in)) - goto out2; + CLASS(fd, f_in)(fd_in); + if (fd_empty(f_in)) + return -EBADF; - f_out = fdget(fd_out); - if (!fd_file(f_out)) - goto out1; + CLASS(fd, f_out)(fd_out); + if (fd_empty(f_out)) + return -EBADF; - ret = -EFAULT; if (off_in) { if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) - goto out; + return -EFAULT; } else { pos_in = fd_file(f_in)->f_pos; } if (off_out) { if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) - goto out; + return -EFAULT; } else { pos_out = fd_file(f_out)->f_pos; } - ret = -EINVAL; if (flags != 0) - goto out; + return -EINVAL; ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len, flags); @@ -1714,12 +1710,6 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, fd_file(f_out)->f_pos = pos_out; } } - -out: - fdput(f_out); -out1: - fdput(f_in); -out2: return ret; } diff --git a/fs/signalfd.c b/fs/signalfd.c index 736bebf93591..d1a5f43ce466 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -288,20 +288,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) fd_install(ufd, file); } else { - struct fd f = fdget(ufd); - if (!fd_file(f)) + CLASS(fd, f)(ufd); + if (fd_empty(f)) return -EBADF; ctx = fd_file(f)->private_data; - if (fd_file(f)->f_op != &signalfd_fops) { - fdput(f); + if (fd_file(f)->f_op != &signalfd_fops) return -EINVAL; - } spin_lock_irq(¤t->sighand->siglock); ctx->sigmask = *mask; spin_unlock_irq(¤t->sighand->siglock); wake_up(¤t->sighand->signalfd_wqh); - fdput(f); } return ufd; diff --git a/fs/sync.c b/fs/sync.c index 67df255eb189..2955cd4c77a3 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -148,11 +148,11 @@ void emergency_sync(void) */ SYSCALL_DEFINE1(syncfs, int, fd) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct super_block *sb; int ret, ret2; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; sb = fd_file(f)->f_path.dentry->d_sb; @@ -162,7 +162,6 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err); - fdput(f); return ret ? ret : ret2; } @@ -205,14 +204,12 @@ EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) { - struct fd f = fdget(fd); - int ret = -EBADF; + CLASS(fd, f)(fd); - if (fd_file(f)) { - ret = vfs_fsync(fd_file(f), datasync); - fdput(f); - } - return ret; + if (fd_empty(f)) + return -EBADF; + + return vfs_fsync(fd_file(f), datasync); } SYSCALL_DEFINE1(fsync, unsigned int, fd) @@ -355,16 +352,12 @@ out: int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, unsigned int flags) { - int ret; - struct fd f; + CLASS(fd, f)(fd); - ret = -EBADF; - f = fdget(fd); - if (fd_file(f)) - ret = sync_file_range(fd_file(f), offset, nbytes, flags); + if (fd_empty(f)) + return -EBADF; - fdput(f); - return ret; + return sync_file_range(fd_file(f), offset, nbytes, flags); } SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index a26593979887..d5f0c3d9c35f 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -106,29 +106,21 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) { struct io_ring_ctx *ctx_attach; struct io_sq_data *sqd; - struct fd f; + CLASS(fd, f)(p->wq_fd); - f = fdget(p->wq_fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-ENXIO); - if (!io_is_uring_fops(fd_file(f))) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return ERR_PTR(-EINVAL); - } ctx_attach = fd_file(f)->private_data; sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); + if (!sqd) return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); + if (sqd->task_tgid != current->tgid) return ERR_PTR(-EPERM); - } refcount_inc(&sqd->refs); - fdput(f); return sqd; } @@ -417,16 +409,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, /* Retain compatibility with failing for an invalid attach attempt */ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!fd_file(f)) + CLASS(fd, f)(p->wq_fd); + if (fd_empty(f)) return -ENXIO; - if (!io_is_uring_fops(fd_file(f))) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return -EINVAL; - } - fdput(f); } if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; diff --git a/kernel/events/core.c b/kernel/events/core.c index 85b209626dd7..075ce7299973 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -966,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; - struct fd f = fdget(fd); + CLASS(fd, f)(fd); int ret = 0; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); - if (IS_ERR(css)) { - ret = PTR_ERR(css); - goto out; - } + if (IS_ERR(css)) + return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) - goto out; + return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -995,8 +993,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, perf_detach_cgroup(event); ret = -EINVAL; } -out: - fdput(f); return ret; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index dc952c3b05af..c9d97ed20122 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -545,12 +545,12 @@ static void commit_nsset(struct nsset *nsset) SYSCALL_DEFINE2(setns, int, fd, int, flags) { - struct fd f = fdget(fd); + CLASS(fd, f)(fd); struct ns_common *ns = NULL; struct nsset nsset = {}; int err = 0; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; if (proc_ns_file(fd_file(f))) { @@ -580,7 +580,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) } put_nsset(&nsset); out: - fdput(f); return err; } diff --git a/kernel/pid.c b/kernel/pid.c index 2715afb77eab..b5bbc1a8a6e4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -536,11 +536,10 @@ EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { - struct fd f; + CLASS(fd, f)(fd); struct pid *pid; - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); pid = pidfd_pid(fd_file(f)); @@ -548,8 +547,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) get_pid(pid); *flags = fd_file(f)->f_flags; } - - fdput(f); return pid; } diff --git a/kernel/sys.c b/kernel/sys.c index 4da31f28fda8..ebe10c27a9f4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1911,12 +1911,11 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct fd exe; + CLASS(fd, exe)(fd); struct inode *inode; int err; - exe = fdget(fd); - if (!fd_file(exe)) + if (fd_empty(exe)) return -EBADF; inode = file_inode(fd_file(exe)); @@ -1926,18 +1925,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) * sure that this one is executable as well, to avoid breaking an * overall picture. */ - err = -EACCES; if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) - goto exit; + return -EACCES; err = file_permission(fd_file(exe), MAY_EXEC); if (err) - goto exit; + return err; - err = replace_mm_exe_file(mm, fd_file(exe)); -exit: - fdput(exe); - return err; + return replace_mm_exe_file(mm, fd_file(exe)); } /* diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index d36242fd4936..1895fbc32bcb 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -663,16 +663,14 @@ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); - struct fd f; + CLASS(fd, f)(fd); - f = fdget(fd); - if (fd_file(f)) { + if (!fd_empty(f)) { pipe = get_pipe_info(fd_file(f), false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } - fdput(f); } return wqueue; diff --git a/mm/fadvise.c b/mm/fadvise.c index 532dee205c6e..588fe76c5a14 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -190,16 +190,12 @@ EXPORT_SYMBOL(vfs_fadvise); int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { - struct fd f = fdget(fd); - int ret; + CLASS(fd, f)(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = vfs_fadvise(fd_file(f), offset, len, advice); - - fdput(f); - return ret; + return vfs_fadvise(fd_file(f), offset, len, advice); } SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) diff --git a/mm/readahead.c b/mm/readahead.c index 3dc6c7a128dd..9a807727d809 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -673,29 +673,22 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { - ssize_t ret; - struct fd f; + CLASS(fd, f)(fd); - ret = -EBADF; - f = fdget(fd); - if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ)) - goto out; + if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) + return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ - ret = -EINVAL; if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || (!S_ISREG(file_inode(fd_file(f))->i_mode) && !S_ISBLK(file_inode(fd_file(f))->i_mode))) - goto out; + return -EINVAL; - ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); -out: - fdput(f); - return ret; + return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e39479f1c9a4..b231b27d8268 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -694,20 +694,18 @@ EXPORT_SYMBOL_GPL(get_net_ns); struct net *get_net_ns_by_fd(int fd) { - struct fd f = fdget(fd); - struct net *net = ERR_PTR(-EINVAL); + CLASS(fd, f)(fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-EBADF); if (proc_ns_file(fd_file(f))) { struct ns_common *ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops == &netns_operations) - net = get_net(container_of(ns, struct net, ns)); + return get_net(container_of(ns, struct net, ns)); } - fdput(f); - return net; + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(get_net_ns_by_fd); #endif diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index f32eb38abd0f..f937f748d9e8 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -241,31 +241,21 @@ SYSCALL_DEFINE3(landlock_create_ruleset, static struct landlock_ruleset *get_ruleset_from_fd(const int fd, const fmode_t mode) { - struct fd ruleset_f; + CLASS(fd, ruleset_f)(fd); struct landlock_ruleset *ruleset; - ruleset_f = fdget(fd); - if (!fd_file(ruleset_f)) + if (fd_empty(ruleset_f)) return ERR_PTR(-EBADF); /* Checks FD type and access right. */ - if (fd_file(ruleset_f)->f_op != &ruleset_fops) { - ruleset = ERR_PTR(-EBADFD); - goto out_fdput; - } - if (!(fd_file(ruleset_f)->f_mode & mode)) { - ruleset = ERR_PTR(-EPERM); - goto out_fdput; - } + if (fd_file(ruleset_f)->f_op != &ruleset_fops) + return ERR_PTR(-EBADFD); + if (!(fd_file(ruleset_f)->f_mode & mode)) + return ERR_PTR(-EPERM); ruleset = fd_file(ruleset_f)->private_data; - if (WARN_ON_ONCE(ruleset->num_layers != 1)) { - ruleset = ERR_PTR(-EINVAL); - goto out_fdput; - } + if (WARN_ON_ONCE(ruleset->num_layers != 1)) + return ERR_PTR(-EINVAL); landlock_get_ruleset(ruleset); - -out_fdput: - fdput(ruleset_f); return ruleset; } diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 388ae471d258..53262b8a7656 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -190,11 +190,10 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; - struct fd f; + CLASS(fd, f)(fd); int ret; - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; ret = -ENOENT; @@ -220,9 +219,6 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd) kvm_vfio_update_coherency(dev); mutex_unlock(&kv->lock); - - fdput(f); - return ret; } -- cgit v1.3 From dbd5e2e79ed8653ac2ae255e42d1189283343a0c Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 24 Oct 2024 17:37:42 +0800 Subject: net: tcp: Add noinline_for_tracing annotation for tcp_drop_reason() We previously hooked the tcp_drop_reason() function using BPF to monitor TCP drop reasons. However, after upgrading our compiler from GCC 9 to GCC 11, tcp_drop_reason() is now inlined, preventing us from hooking into it. To address this, it would be beneficial to make noinline explicitly for tracing. Link: https://lore.kernel.org/netdev/CANn89iJuShCmidCi_ZkYABtmscwbVjhuDta1MS5LxV_4H9tKOA@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Yafang Shao Cc: Menglong Dong Link: https://patch.msgid.link/20241024093742.87681-3-laoar.shao@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d844e1f867f..5bdf13ac26ef 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4921,8 +4921,8 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, return res; } -static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, - enum skb_drop_reason reason) +noinline_for_tracing static void +tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { sk_drops_add(sk, skb); sk_skb_reason_drop(sk, skb, reason); -- cgit v1.3 From f12b363887c706c40611fba645265527a8415832 Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Sun, 27 Oct 2024 21:48:28 -0700 Subject: net: dsa: use ethtool string helpers These are the preferred way to copy ethtool strings. Avoids incrementing pointers all over the place. Signed-off-by: Rosen Penev (for hellcreek driver) Reviewed-by: Kurt Kanzenbach Link: https://patch.msgid.link/20241028044828.1639668-1-rosenp@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 3 +- drivers/net/dsa/bcm_sf2.c | 4 +-- drivers/net/dsa/bcm_sf2.h | 4 +-- drivers/net/dsa/bcm_sf2_cfp.c | 22 ++++-------- drivers/net/dsa/dsa_loop.c | 3 +- drivers/net/dsa/hirschmann/hellcreek.c | 8 ++--- drivers/net/dsa/microchip/ksz_common.c | 6 ++-- drivers/net/dsa/mv88e6xxx/chip.c | 57 +++++++++++++------------------ drivers/net/dsa/mv88e6xxx/chip.h | 6 ++-- drivers/net/dsa/mv88e6xxx/serdes.c | 14 ++++---- drivers/net/dsa/mv88e6xxx/serdes.h | 8 ++--- drivers/net/dsa/rzn1_a5psw.c | 6 ++-- drivers/net/dsa/sja1105/sja1105_ethtool.c | 7 ++-- drivers/net/dsa/xrs700x/xrs700x.c | 6 ++-- net/dsa/user.c | 13 +++---- 15 files changed, 64 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index c39cb119e760..285785c942b0 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -989,8 +989,7 @@ void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset, if (stringset == ETH_SS_STATS) { for (i = 0; i < mib_size; i++) - strscpy(data + i * ETH_GSTRING_LEN, - mibs[i].name, ETH_GSTRING_LEN); + ethtool_puts(&data, mibs[i].name); } else if (stringset == ETH_SS_PHY_STATS) { phydev = b53_get_phy_device(ds, port); if (!phydev) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 9201f07839ad..43bde1f583ff 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1183,8 +1183,8 @@ static void bcm_sf2_sw_get_strings(struct dsa_switch *ds, int port, int cnt = b53_get_sset_count(ds, port, stringset); b53_get_strings(ds, port, stringset, data); - bcm_sf2_cfp_get_strings(ds, port, stringset, - data + cnt * ETH_GSTRING_LEN); + data += cnt * ETH_GSTRING_LEN; + bcm_sf2_cfp_get_strings(ds, port, stringset, &data); } static void bcm_sf2_sw_get_ethtool_stats(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h index 4fda075a3449..be9f3b29019f 100644 --- a/drivers/net/dsa/bcm_sf2.h +++ b/drivers/net/dsa/bcm_sf2.h @@ -228,8 +228,8 @@ int bcm_sf2_set_rxnfc(struct dsa_switch *ds, int port, int bcm_sf2_cfp_rst(struct bcm_sf2_priv *priv); void bcm_sf2_cfp_exit(struct dsa_switch *ds); int bcm_sf2_cfp_resume(struct dsa_switch *ds); -void bcm_sf2_cfp_get_strings(struct dsa_switch *ds, int port, - u32 stringset, uint8_t *data); +void bcm_sf2_cfp_get_strings(struct dsa_switch *ds, int port, u32 stringset, + uint8_t **data); void bcm_sf2_cfp_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data); int bcm_sf2_cfp_get_sset_count(struct dsa_switch *ds, int port, int sset); diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index c88ee3dd4299..e22362e6f0cd 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -1279,27 +1279,19 @@ static const struct bcm_sf2_cfp_stat { }, }; -void bcm_sf2_cfp_get_strings(struct dsa_switch *ds, int port, - u32 stringset, uint8_t *data) +void bcm_sf2_cfp_get_strings(struct dsa_switch *ds, int port, u32 stringset, + uint8_t **data) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - unsigned int s = ARRAY_SIZE(bcm_sf2_cfp_stats); - char buf[ETH_GSTRING_LEN]; - unsigned int i, j, iter; + unsigned int i, j; if (stringset != ETH_SS_STATS) return; - for (i = 1; i < priv->num_cfp_rules; i++) { - for (j = 0; j < s; j++) { - snprintf(buf, sizeof(buf), - "CFP%03d_%sCntr", - i, bcm_sf2_cfp_stats[j].name); - iter = (i - 1) * s + j; - strscpy(data + iter * ETH_GSTRING_LEN, - buf, ETH_GSTRING_LEN); - } - } + for (i = 1; i < priv->num_cfp_rules; i++) + for (j = 0; j < ARRAY_SIZE(bcm_sf2_cfp_stats); j++) + ethtool_sprintf(data, "CFP%03d_%sCntr", i, + bcm_sf2_cfp_stats[j].name); } void bcm_sf2_cfp_get_ethtool_stats(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index c70ed67cc188..adbab544c60f 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -121,8 +121,7 @@ static void dsa_loop_get_strings(struct dsa_switch *ds, int port, return; for (i = 0; i < __DSA_LOOP_CNT_MAX; i++) - memcpy(data + i * ETH_GSTRING_LEN, - ps->ports[port].mib[i].name, ETH_GSTRING_LEN); + ethtool_puts(&data, ps->ports[port].mib[i].name); } static void dsa_loop_get_ethtool_stats(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index d798f17cf7ea..283ec5a6e23c 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -294,12 +294,8 @@ static void hellcreek_get_strings(struct dsa_switch *ds, int port, { int i; - for (i = 0; i < ARRAY_SIZE(hellcreek_counter); ++i) { - const struct hellcreek_counter *counter = &hellcreek_counter[i]; - - strscpy(data + i * ETH_GSTRING_LEN, - counter->name, ETH_GSTRING_LEN); - } + for (i = 0; i < ARRAY_SIZE(hellcreek_counter); ++i) + ethtool_puts(&data, hellcreek_counter[i].name); } static int hellcreek_get_sset_count(struct dsa_switch *ds, int port, int sset) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 5290f5ad98f3..f73833e24622 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2112,10 +2112,8 @@ static void ksz_get_strings(struct dsa_switch *ds, int port, if (stringset != ETH_SS_STATS) return; - for (i = 0; i < dev->info->mib_cnt; i++) { - memcpy(buf + i * ETH_GSTRING_LEN, - dev->info->mib_names[i].string, ETH_GSTRING_LEN); - } + for (i = 0; i < dev->info->mib_cnt; i++) + ethtool_puts(&buf, dev->info->mib_names[i].string); } /** diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index c75005b4d86e..3a792f79270d 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1153,42 +1153,37 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip, return value; } -static int mv88e6xxx_stats_get_strings(struct mv88e6xxx_chip *chip, - uint8_t *data, int types) +static void mv88e6xxx_stats_get_strings(struct mv88e6xxx_chip *chip, + uint8_t **data, int types) { const struct mv88e6xxx_hw_stat *stat; - int i, j; + int i; - for (i = 0, j = 0; i < ARRAY_SIZE(mv88e6xxx_hw_stats); i++) { + for (i = 0; i < ARRAY_SIZE(mv88e6xxx_hw_stats); i++) { stat = &mv88e6xxx_hw_stats[i]; - if (stat->type & types) { - memcpy(data + j * ETH_GSTRING_LEN, stat->string, - ETH_GSTRING_LEN); - j++; - } + if (stat->type & types) + ethtool_puts(data, stat->string); } - - return j; } -static int mv88e6095_stats_get_strings(struct mv88e6xxx_chip *chip, - uint8_t *data) +static void mv88e6095_stats_get_strings(struct mv88e6xxx_chip *chip, + uint8_t **data) { - return mv88e6xxx_stats_get_strings(chip, data, - STATS_TYPE_BANK0 | STATS_TYPE_PORT); + mv88e6xxx_stats_get_strings(chip, data, + STATS_TYPE_BANK0 | STATS_TYPE_PORT); } -static int mv88e6250_stats_get_strings(struct mv88e6xxx_chip *chip, - uint8_t *data) +static void mv88e6250_stats_get_strings(struct mv88e6xxx_chip *chip, + uint8_t **data) { - return mv88e6xxx_stats_get_strings(chip, data, STATS_TYPE_BANK0); + mv88e6xxx_stats_get_strings(chip, data, STATS_TYPE_BANK0); } -static int mv88e6320_stats_get_strings(struct mv88e6xxx_chip *chip, - uint8_t *data) +static void mv88e6320_stats_get_strings(struct mv88e6xxx_chip *chip, + uint8_t **data) { - return mv88e6xxx_stats_get_strings(chip, data, - STATS_TYPE_BANK0 | STATS_TYPE_BANK1); + mv88e6xxx_stats_get_strings(chip, data, + STATS_TYPE_BANK0 | STATS_TYPE_BANK1); } static const uint8_t *mv88e6xxx_atu_vtu_stats_strings[] = { @@ -1199,21 +1194,18 @@ static const uint8_t *mv88e6xxx_atu_vtu_stats_strings[] = { "vtu_miss_violation", }; -static void mv88e6xxx_atu_vtu_get_strings(uint8_t *data) +static void mv88e6xxx_atu_vtu_get_strings(uint8_t **data) { unsigned int i; for (i = 0; i < ARRAY_SIZE(mv88e6xxx_atu_vtu_stats_strings); i++) - strscpy(data + i * ETH_GSTRING_LEN, - mv88e6xxx_atu_vtu_stats_strings[i], - ETH_GSTRING_LEN); + ethtool_puts(data, mv88e6xxx_atu_vtu_stats_strings[i]); } static void mv88e6xxx_get_strings(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data) { struct mv88e6xxx_chip *chip = ds->priv; - int count = 0; if (stringset != ETH_SS_STATS) return; @@ -1221,15 +1213,12 @@ static void mv88e6xxx_get_strings(struct dsa_switch *ds, int port, mv88e6xxx_reg_lock(chip); if (chip->info->ops->stats_get_strings) - count = chip->info->ops->stats_get_strings(chip, data); + chip->info->ops->stats_get_strings(chip, &data); - if (chip->info->ops->serdes_get_strings) { - data += count * ETH_GSTRING_LEN; - count = chip->info->ops->serdes_get_strings(chip, port, data); - } + if (chip->info->ops->serdes_get_strings) + chip->info->ops->serdes_get_strings(chip, port, &data); - data += count * ETH_GSTRING_LEN; - mv88e6xxx_atu_vtu_get_strings(data); + mv88e6xxx_atu_vtu_get_strings(&data); mv88e6xxx_reg_unlock(chip); } diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 48399ab5355a..9fe8e8a7856b 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -606,7 +606,7 @@ struct mv88e6xxx_ops { /* Return the number of strings describing statistics */ int (*stats_get_sset_count)(struct mv88e6xxx_chip *chip); - int (*stats_get_strings)(struct mv88e6xxx_chip *chip, uint8_t *data); + void (*stats_get_strings)(struct mv88e6xxx_chip *chip, uint8_t **data); size_t (*stats_get_stat)(struct mv88e6xxx_chip *chip, int port, const struct mv88e6xxx_hw_stat *stat, uint64_t *data); @@ -633,8 +633,8 @@ struct mv88e6xxx_ops { /* Statistics from the SERDES interface */ int (*serdes_get_sset_count)(struct mv88e6xxx_chip *chip, int port); - int (*serdes_get_strings)(struct mv88e6xxx_chip *chip, int port, - uint8_t *data); + int (*serdes_get_strings)(struct mv88e6xxx_chip *chip, int port, + uint8_t **data); size_t (*serdes_get_stats)(struct mv88e6xxx_chip *chip, int port, uint64_t *data); diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 01ea53940786..b3330211edbc 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -132,8 +132,8 @@ int mv88e6352_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port) return ARRAY_SIZE(mv88e6352_serdes_hw_stats); } -int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip, - int port, uint8_t *data) +int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip, int port, + uint8_t **data) { struct mv88e6352_serdes_hw_stat *stat; int err, i; @@ -144,8 +144,7 @@ int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip, for (i = 0; i < ARRAY_SIZE(mv88e6352_serdes_hw_stats); i++) { stat = &mv88e6352_serdes_hw_stats[i]; - memcpy(data + i * ETH_GSTRING_LEN, stat->string, - ETH_GSTRING_LEN); + ethtool_puts(data, stat->string); } return ARRAY_SIZE(mv88e6352_serdes_hw_stats); } @@ -394,8 +393,8 @@ int mv88e6390_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port) return ARRAY_SIZE(mv88e6390_serdes_hw_stats); } -int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip, - int port, uint8_t *data) +int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip, int port, + uint8_t **data) { struct mv88e6390_serdes_hw_stat *stat; int i; @@ -405,8 +404,7 @@ int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip, for (i = 0; i < ARRAY_SIZE(mv88e6390_serdes_hw_stats); i++) { stat = &mv88e6390_serdes_hw_stats[i]; - memcpy(data + i * ETH_GSTRING_LEN, stat->string, - ETH_GSTRING_LEN); + ethtool_puts(data, stat->string); } return ARRAY_SIZE(mv88e6390_serdes_hw_stats); } diff --git a/drivers/net/dsa/mv88e6xxx/serdes.h b/drivers/net/dsa/mv88e6xxx/serdes.h index ff5c3ab31e15..ad887d8601bc 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.h +++ b/drivers/net/dsa/mv88e6xxx/serdes.h @@ -125,13 +125,13 @@ unsigned int mv88e6352_serdes_irq_mapping(struct mv88e6xxx_chip *chip, unsigned int mv88e6390_serdes_irq_mapping(struct mv88e6xxx_chip *chip, int port); int mv88e6352_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port); -int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip, - int port, uint8_t *data); +int mv88e6352_serdes_get_strings(struct mv88e6xxx_chip *chip, int port, + uint8_t **data); size_t mv88e6352_serdes_get_stats(struct mv88e6xxx_chip *chip, int port, uint64_t *data); int mv88e6390_serdes_get_sset_count(struct mv88e6xxx_chip *chip, int port); -int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip, - int port, uint8_t *data); +int mv88e6390_serdes_get_strings(struct mv88e6xxx_chip *chip, int port, + uint8_t **data); size_t mv88e6390_serdes_get_stats(struct mv88e6xxx_chip *chip, int port, uint64_t *data); diff --git a/drivers/net/dsa/rzn1_a5psw.c b/drivers/net/dsa/rzn1_a5psw.c index 1135a32e4b7e..66974379334a 100644 --- a/drivers/net/dsa/rzn1_a5psw.c +++ b/drivers/net/dsa/rzn1_a5psw.c @@ -802,10 +802,8 @@ static void a5psw_get_strings(struct dsa_switch *ds, int port, u32 stringset, if (stringset != ETH_SS_STATS) return; - for (u = 0; u < ARRAY_SIZE(a5psw_stats); u++) { - memcpy(data + u * ETH_GSTRING_LEN, a5psw_stats[u].name, - ETH_GSTRING_LEN); - } + for (u = 0; u < ARRAY_SIZE(a5psw_stats); u++) + ethtool_puts(&data, a5psw_stats[u].name); } static void a5psw_get_ethtool_stats(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/sja1105/sja1105_ethtool.c b/drivers/net/dsa/sja1105/sja1105_ethtool.c index decc6c931dc1..2ea64b1d026d 100644 --- a/drivers/net/dsa/sja1105/sja1105_ethtool.c +++ b/drivers/net/dsa/sja1105/sja1105_ethtool.c @@ -586,7 +586,6 @@ void sja1105_get_strings(struct dsa_switch *ds, int port, { struct sja1105_private *priv = ds->priv; enum sja1105_counter_index max_ctr, i; - char *p = data; if (stringset != ETH_SS_STATS) return; @@ -597,10 +596,8 @@ void sja1105_get_strings(struct dsa_switch *ds, int port, else max_ctr = __MAX_SJA1105PQRS_PORT_COUNTER; - for (i = 0; i < max_ctr; i++) { - strscpy(p, sja1105_port_counters[i].name, ETH_GSTRING_LEN); - p += ETH_GSTRING_LEN; - } + for (i = 0; i < max_ctr; i++) + ethtool_puts(&data, sja1105_port_counters[i].name); } int sja1105_get_sset_count(struct dsa_switch *ds, int port, int sset) diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c index de3b768f2ff9..4dbcc49a9e52 100644 --- a/drivers/net/dsa/xrs700x/xrs700x.c +++ b/drivers/net/dsa/xrs700x/xrs700x.c @@ -91,10 +91,8 @@ static void xrs700x_get_strings(struct dsa_switch *ds, int port, if (stringset != ETH_SS_STATS) return; - for (i = 0; i < ARRAY_SIZE(xrs700x_mibs); i++) { - strscpy(data, xrs700x_mibs[i].name, ETH_GSTRING_LEN); - data += ETH_GSTRING_LEN; - } + for (i = 0; i < ARRAY_SIZE(xrs700x_mibs); i++) + ethtool_puts(&data, xrs700x_mibs[i].name); } static int xrs700x_get_sset_count(struct dsa_switch *ds, int port, int sset) diff --git a/net/dsa/user.c b/net/dsa/user.c index b18ad0105b01..06c30a9e29ff 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1042,15 +1042,12 @@ static void dsa_user_get_strings(struct net_device *dev, struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { - int len = ETH_GSTRING_LEN; - - strscpy_pad(data, "tx_packets", len); - strscpy_pad(data + len, "tx_bytes", len); - strscpy_pad(data + 2 * len, "rx_packets", len); - strscpy_pad(data + 3 * len, "rx_bytes", len); + ethtool_puts(&data, "tx_packets"); + ethtool_puts(&data, "tx_bytes"); + ethtool_puts(&data, "rx_packets"); + ethtool_puts(&data, "rx_bytes"); if (ds->ops->get_strings) - ds->ops->get_strings(ds, dp->index, stringset, - data + 4 * len); + ds->ops->get_strings(ds, dp->index, stringset, data); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } -- cgit v1.3 From 0ead60804b64f5bd6999eec88e503c6a1a242d41 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 29 Oct 2024 13:46:21 -0400 Subject: sctp: properly validate chunk size in sctp_sf_ootb() A size validation fix similar to that in Commit 50619dbf8db7 ("sctp: add size validation when walking chunks") is also required in sctp_sf_ootb() to address a crash reported by syzbot: BUG: KMSAN: uninit-value in sctp_sf_ootb+0x7f5/0xce0 net/sctp/sm_statefuns.c:3712 sctp_sf_ootb+0x7f5/0xce0 net/sctp/sm_statefuns.c:3712 sctp_do_sm+0x181/0x93d0 net/sctp/sm_sideeffect.c:1166 sctp_endpoint_bh_rcv+0xc38/0xf90 net/sctp/endpointola.c:407 sctp_inq_push+0x2ef/0x380 net/sctp/inqueue.c:88 sctp_rcv+0x3831/0x3b20 net/sctp/input.c:243 sctp4_rcv+0x42/0x50 net/sctp/protocol.c:1159 ip_protocol_deliver_rcu+0xb51/0x13d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x336/0x500 net/ipv4/ip_input.c:233 Reported-by: syzbot+f0cbb34d39392f2746ca@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Xin Long Link: https://patch.msgid.link/a29ebb6d8b9f8affd0f9abb296faafafe10c17d8.1730223981.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- net/sctp/sm_statefuns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7d315a18612b..a0524ba8d787 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3751,7 +3751,7 @@ enum sctp_disposition sctp_sf_ootb(struct net *net, } ch = (struct sctp_chunkhdr *)ch_end; - } while (ch_end < skb_tail_pointer(skb)); + } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); if (ootb_shut_ack) return sctp_sf_shut_8_4_5(net, ep, asoc, type, arg, commands); -- cgit v1.3 From 3bd9b9abdf1563a22041b7255baea6d449902f1a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 29 Oct 2024 15:58:47 -0600 Subject: net: ethtool: Avoid thousands of -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Change the type of the middle struct member currently causing trouble from `struct ethtool_link_settings` to `struct ethtool_link_settings_hdr`. Additionally, update the type of some variables in various functions that don't access the flexible-array member, changing them to the newly created `struct ethtool_link_settings_hdr`. These changes are needed because the type of the conflicting middle members changed. So, those instances that expect the type to be `struct ethtool_link_settings` should be adjusted to the newly created type `struct ethtool_link_settings_hdr`. Also, adjust variable declarations to follow the reverse xmas tree convention. Fix 3338 of the following -Wflex-array-member-not-at-end warnings: include/linux/ethtool.h:214:38: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Link: https://patch.msgid.link/0bc2809fe2a6c11dd4c8a9a10d9bd65cccdb559b.1730238285.git.gustavoars@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 6 +++--- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 4 ++-- include/linux/ethtool.h | 2 +- net/ethtool/ioctl.c | 2 +- net/ethtool/linkinfo.c | 8 ++++---- net/ethtool/linkmodes.c | 18 +++++++++++------- 9 files changed, 26 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index f71cc8188b4e..e0ebe69110bf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2781,7 +2781,7 @@ u32 bnxt_fw_to_ethtool_speed(u16 fw_link_speed) static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings, struct bnxt_link_info *link_info) { - struct ethtool_link_settings *base = &lk_ksettings->base; + struct ethtool_link_settings_hdr *base = &lk_ksettings->base; if (link_info->link_state == BNXT_LINK_STATE_UP) { base->speed = bnxt_fw_to_ethtool_speed(link_info->link_speed); @@ -2800,7 +2800,7 @@ static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings, static int bnxt_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *lk_ksettings) { - struct ethtool_link_settings *base = &lk_ksettings->base; + struct ethtool_link_settings_hdr *base = &lk_ksettings->base; enum ethtool_link_mode_bit_indices link_mode; struct bnxt *bp = netdev_priv(dev); struct bnxt_link_info *link_info; @@ -3023,9 +3023,9 @@ u16 bnxt_get_fw_auto_link_speeds(const unsigned long *mode) static int bnxt_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *lk_ksettings) { + const struct ethtool_link_settings_hdr *base = &lk_ksettings->base; struct bnxt *bp = netdev_priv(dev); struct bnxt_link_info *link_info = &bp->link_info; - const struct ethtool_link_settings *base = &lk_ksettings->base; bool set_pause = false; u32 speed, lanes = 0; int rc = 0; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 7f3f5afa864f..45d28a65347e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -662,8 +662,8 @@ static unsigned int lmm_to_fw_caps(const unsigned long *link_mode_mask) static int get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { + struct ethtool_link_settings_hdr *base = &link_ksettings->base; struct port_info *pi = netdev_priv(dev); - struct ethtool_link_settings *base = &link_ksettings->base; /* For the nonce, the Firmware doesn't send up Port State changes * when the Virtual Interface attached to the Port is down. So @@ -717,9 +717,9 @@ static int get_link_ksettings(struct net_device *dev, static int set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *link_ksettings) { + const struct ethtool_link_settings_hdr *base = &link_ksettings->base; struct port_info *pi = netdev_priv(dev); struct link_config *lc = &pi->link_cfg; - const struct ethtool_link_settings *base = &link_ksettings->base; struct link_config old_lc; unsigned int fw_caps; int ret = 0; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 2fbe0f059a0b..61d08547e3f9 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -1436,8 +1436,8 @@ static void fw_caps_to_lmm(enum fw_port_type port_type, static int cxgb4vf_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { + struct ethtool_link_settings_hdr *base = &link_ksettings->base; struct port_info *pi = netdev_priv(dev); - struct ethtool_link_settings *base = &link_ksettings->base; /* For the nonce, the Firmware doesn't send up Port State changes * when the Virtual Interface attached to the Port is down. So diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index f7986f2b6a17..4fe85780a950 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -129,8 +129,8 @@ static void enic_intr_coal_set_rx(struct enic *enic, u32 timer) static int enic_get_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *ecmd) { + struct ethtool_link_settings_hdr *base = &ecmd->base; struct enic *enic = netdev_priv(netdev); - struct ethtool_link_settings *base = &ecmd->base; ethtool_link_ksettings_add_link_mode(ecmd, supported, 10000baseT_Full); diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index e50e1df0a433..c553da16d4b1 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -504,7 +504,7 @@ static int qede_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { typeof(cmd->link_modes) *link_modes = &cmd->link_modes; - struct ethtool_link_settings *base = &cmd->base; + struct ethtool_link_settings_hdr *base = &cmd->base; struct qede_dev *edev = netdev_priv(dev); struct qed_link_output current_link; @@ -537,7 +537,7 @@ static int qede_get_link_ksettings(struct net_device *dev, static int qede_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { - const struct ethtool_link_settings *base = &cmd->base; + const struct ethtool_link_settings_hdr *base = &cmd->base; const struct ethtool_forced_speed_map *map; struct qede_dev *edev = netdev_priv(dev); struct qed_link_output current_link; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 12f6dc567598..1199e308c8dd 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -211,7 +211,7 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); * fields, but they are allowed to overwrite them (will be ignored). */ struct ethtool_link_ksettings { - struct ethtool_link_settings base; + struct ethtool_link_settings_hdr base; struct { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 5cc131cdb1bc..7da94e26ced6 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -425,7 +425,7 @@ convert_link_ksettings_to_legacy_settings( /* layout of the struct passed from/to userland */ struct ethtool_link_usettings { - struct ethtool_link_settings base; + struct ethtool_link_settings_hdr base; struct { __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 30b8ce275159..2d5bc57160be 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -8,9 +8,9 @@ struct linkinfo_req_info { }; struct linkinfo_reply_data { - struct ethnl_reply_data base; - struct ethtool_link_ksettings ksettings; - struct ethtool_link_settings *lsettings; + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings_hdr *lsettings; }; #define LINKINFO_REPDATA(__reply_base) \ @@ -98,7 +98,7 @@ static int ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; - struct ethtool_link_settings *lsettings; + struct ethtool_link_settings_hdr *lsettings; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 259cd9ef1f2a..17e49cf89f03 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -11,10 +11,10 @@ struct linkmodes_req_info { }; struct linkmodes_reply_data { - struct ethnl_reply_data base; - struct ethtool_link_ksettings ksettings; - struct ethtool_link_settings *lsettings; - bool peer_empty; + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings_hdr *lsettings; + bool peer_empty; }; #define LINKMODES_REPDATA(__reply_base) \ @@ -62,10 +62,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; - const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct ethtool_link_settings_hdr *lsettings; int len, ret; + lsettings = &ksettings->base; + len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ @@ -103,10 +105,12 @@ static int linkmodes_fill_reply(struct sk_buff *skb, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; - const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct ethtool_link_settings_hdr *lsettings; int ret; + lsettings = &ksettings->base; + if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) return -EMSGSIZE; @@ -237,7 +241,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod, const struct net_device *dev) { - struct ethtool_link_settings *lsettings = &ksettings->base; + struct ethtool_link_settings_hdr *lsettings = &ksettings->base; bool req_speed, req_lanes, req_duplex; const struct nlattr *master_slave_cfg, *lanes_cfg; int ret; -- cgit v1.3 From 6b2d11e2d8fc130df4708be0b6b53fd3e6b54cf6 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 30 Oct 2024 04:22:33 +0000 Subject: net/tcp: Add missing lockdep annotations for TCP-AO hlist traversals Under CONFIG_PROVE_RCU_LIST + CONFIG_RCU_EXPERT hlist_for_each_entry_rcu() provides very helpful splats, which help to find possible issues. I missed CONFIG_RCU_EXPERT=y in my testing config the same as described in a3e4bf7f9675 ("configs/debug: make sure PROVE_RCU_LIST=y takes effect"). The fix itself is trivial: add the very same lockdep annotations as were used to dereference ao_info from the socket. Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20241028152645.35a8be66@kernel.org/ Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20241030-tcp-ao-hlist-lockdep-annotate-v1-1-bf641a64d7c6@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ao.h | 3 ++- net/ipv4/tcp_ao.c | 42 +++++++++++++++++++++++------------------- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 4 ++-- 4 files changed, 29 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index 1d46460d0fef..df655ce6987d 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -183,7 +183,8 @@ int tcp_ao_hash_skb(unsigned short int family, const u8 *tkey, int hash_offset, u32 sne); int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family, sockptr_t optval, int optlen); -struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, int sndid, int rcvid); int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, struct request_sock *req, struct sk_buff *skb, diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index db6516092daf..bbb8d5f0eae7 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -109,12 +109,13 @@ bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code) * it's known that the keys in ao_info are matching peer's * family/address/VRF/etc. */ -struct tcp_ao_key *tcp_ao_established_key(struct tcp_ao_info *ao, +struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk, + struct tcp_ao_info *ao, int sndid, int rcvid) { struct tcp_ao_key *key; - hlist_for_each_entry_rcu(key, &ao->head, node) { + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { if ((sndid >= 0 && key->sndid != sndid) || (rcvid >= 0 && key->rcvid != rcvid)) continue; @@ -205,7 +206,7 @@ static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk, int l3index, if (!ao) return NULL; - hlist_for_each_entry_rcu(key, &ao->head, node) { + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) { u8 prefixlen = min(prefix, key->prefixlen); if (!tcp_ao_key_cmp(key, l3index, addr, prefixlen, @@ -793,7 +794,7 @@ int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb, if (!ao_info) return -ENOENT; - *key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); + *key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1); if (!*key) return -ENOENT; *traffic_key = snd_other_key(*key); @@ -979,7 +980,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, */ key = READ_ONCE(info->rnext_key); if (key->rcvid != aoh->keyid) { - key = tcp_ao_established_key(info, -1, aoh->keyid); + key = tcp_ao_established_key(sk, info, -1, aoh->keyid); if (!key) goto key_not_found; } @@ -1003,7 +1004,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, aoh->rnext_keyid, tcp_ao_hdr_maclen(aoh)); /* If the key is not found we do nothing. */ - key = tcp_ao_established_key(info, aoh->rnext_keyid, -1); + key = tcp_ao_established_key(sk, info, aoh->rnext_keyid, -1); if (key) /* pairs with tcp_ao_del_cmd */ WRITE_ONCE(info->current_key, key); @@ -1163,7 +1164,7 @@ void tcp_ao_established(struct sock *sk) if (!ao) return; - hlist_for_each_entry_rcu(key, &ao->head, node) + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) tcp_ao_cache_traffic_keys(sk, ao, key); } @@ -1180,7 +1181,7 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; - hlist_for_each_entry_rcu(key, &ao->head, node) + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) tcp_ao_cache_traffic_keys(sk, ao, key); } @@ -1256,14 +1257,14 @@ int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk, key_head = rcu_dereference(hlist_first_rcu(&new_ao->head)); first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node); - key = tcp_ao_established_key(new_ao, tcp_rsk(req)->ao_keyid, -1); + key = tcp_ao_established_key(req_to_sk(req), new_ao, tcp_rsk(req)->ao_keyid, -1); if (key) new_ao->current_key = key; else new_ao->current_key = first_key; /* set rnext_key */ - key = tcp_ao_established_key(new_ao, -1, tcp_rsk(req)->ao_rcv_next); + key = tcp_ao_established_key(req_to_sk(req), new_ao, -1, tcp_rsk(req)->ao_rcv_next); if (key) new_ao->rnext_key = key; else @@ -1857,12 +1858,12 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, * if there's any. */ if (cmd.set_current) { - new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1); + new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1); if (!new_current) return -ENOENT; } if (cmd.set_rnext) { - new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext); + new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext); if (!new_rnext) return -ENOENT; } @@ -1902,7 +1903,8 @@ static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family, * "It is presumed that an MKT affecting a particular * connection cannot be destroyed during an active connection" */ - hlist_for_each_entry_rcu(key, &ao_info->head, node) { + hlist_for_each_entry_rcu(key, &ao_info->head, node, + lockdep_sock_is_held(sk)) { if (cmd.sndid != key->sndid || cmd.rcvid != key->rcvid) continue; @@ -2000,14 +2002,14 @@ static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family, * if there's any. */ if (cmd.set_current) { - new_current = tcp_ao_established_key(ao_info, cmd.current_key, -1); + new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1); if (!new_current) { err = -ENOENT; goto out; } } if (cmd.set_rnext) { - new_rnext = tcp_ao_established_key(ao_info, -1, cmd.rnext); + new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext); if (!new_rnext) { err = -ENOENT; goto out; @@ -2101,7 +2103,8 @@ int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen) * The layout of the fields in the user and kernel structures is expected to * be the same (including in the 32bit vs 64bit case). */ -static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info, +static int tcp_ao_copy_mkts_to_user(const struct sock *sk, + struct tcp_ao_info *ao_info, sockptr_t optval, sockptr_t optlen) { struct tcp_ao_getsockopt opt_in, opt_out; @@ -2229,7 +2232,8 @@ static int tcp_ao_copy_mkts_to_user(struct tcp_ao_info *ao_info, /* May change in RX, while we're dumping, pre-fetch it */ current_key = READ_ONCE(ao_info->current_key); - hlist_for_each_entry_rcu(key, &ao_info->head, node) { + hlist_for_each_entry_rcu(key, &ao_info->head, node, + lockdep_sock_is_held(sk)) { if (opt_in.get_all) goto match; @@ -2309,7 +2313,7 @@ int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen) if (!ao_info) return -ENOENT; - return tcp_ao_copy_mkts_to_user(ao_info, optval, optlen); + return tcp_ao_copy_mkts_to_user(sk, ao_info, optval, optlen); } int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen) @@ -2396,7 +2400,7 @@ int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen) WRITE_ONCE(ao->snd_sne, cmd.snd_sne); WRITE_ONCE(ao->rcv_sne, cmd.rcv_sne); - hlist_for_each_entry_rcu(key, &ao->head, node) + hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) tcp_ao_cache_traffic_keys(sk, ao, key); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9d3dd101ea71..a38c8b1f44db 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1053,7 +1053,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) } if (aoh) - key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); } } if (key.ao_key) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 597920061a3a..c748eeae1453 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1172,8 +1172,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) - key.ao_key = tcp_ao_established_key(ao_info, - aoh->rnext_keyid, -1); + key.ao_key = tcp_ao_established_key(sk, ao_info, + aoh->rnext_keyid, -1); } } if (key.ao_key) { -- cgit v1.3 From 10f0740234f0b157b41bdc7e9c3555a9b86c1599 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 9 Oct 2024 16:28:06 +1100 Subject: sunrpc: handle -ENOTCONN in xs_tcp_setup_socket() xs_tcp_finish_connecting() can return -ENOTCONN but the switch statement in xs_tcp_setup_socket() treats that as an unhandled error. If we treat it as a known error it would propagate back to call_connect_status() which does handle that error code. This appears to be the intention of the commit (given below) which added -ENOTCONN as a return status for xs_tcp_finish_connecting(). So add -ENOTCONN to the switch statement as an error to pass through to the caller. Link: https://bugzilla.suse.com/show_bug.cgi?id=1231050 Link: https://access.redhat.com/discussions/3434091 Fixes: 01d37c428ae0 ("SUNRPC: xprt_connect() don't abort the task if the transport isn't bound") Signed-off-by: NeilBrown Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 0e1691316f42..1326fbf45a34 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2459,6 +2459,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) case -EHOSTUNREACH: case -EADDRINUSE: case -ENOBUFS: + case -ENOTCONN: break; default: printk("%s: connect returned unhandled error %d\n", -- cgit v1.3 From 9adbb4198bf6cf3634032871118a7052aeaa573f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:13 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splat on rule deletion On rule delete we get: WARNING: suspicious RCU usage net/netfilter/nf_tables_api.c:3420 RCU-list traversed in non-reader section!! 1 lock held by iptables/134: #0: ffff888008c4fcc8 (&nft_net->commit_mutex){+.+.}-{3:3}, at: nf_tables_valid_genid (include/linux/jiffies.h:101) nf_tables Code is fine, no other CPU can change the list because we're holding transaction mutex. Pass the needed lockdep annotation to the iterator and fix two comments for functions that are no longer restricted to rcu-only context. This is enough to resolve rule delete, but there are several other missing annotations, added in followup-patches. Fixes: 28875945ba98 ("rcu: Add support for consolidated-RCU reader checking") Reported-by: Matthieu Baerts Tested-by: Matthieu Baerts Closes: https://lore.kernel.org/netfilter-devel/da27f17f-3145-47af-ad0f-7fd2a823623e@kernel.org/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 30331688301e..80c285ac7e07 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3411,13 +3411,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *__nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry_rcu(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list, + lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } @@ -3425,13 +3427,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -3732,7 +3735,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) return 0; } -/* called with rcu_read_lock held */ +/* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) @@ -3759,7 +3762,7 @@ nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, return ERR_CAST(chain); } - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return ERR_CAST(rule); @@ -4057,7 +4060,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nft_rule_lookup(chain, handle); + rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); @@ -4079,7 +4082,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); @@ -4296,7 +4299,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { if (PTR_ERR(rule) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) @@ -8101,7 +8104,7 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) return 0; } -/* called with rcu_read_lock held */ +/* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) -- cgit v1.3 From 8f5f3786dba765099f12da00e6be0c26f69f2fbd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:14 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splats with sets Same as previous patch. All set handling functions here can be called with transaction mutex held (but not the rcu read lock). The transaction mutex prevents concurrent add/delete, so this is fine. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 80c285ac7e07..a51731d76401 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3986,7 +3986,8 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; @@ -4459,7 +4460,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; -static struct nft_set *nft_set_lookup(const struct nft_table *table, +static struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -4467,7 +4469,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -4517,7 +4520,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nft_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; @@ -4893,7 +4896,7 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); @@ -5229,7 +5232,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -5431,7 +5434,7 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(table, attr, genmask); + set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { @@ -5495,7 +5498,8 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; @@ -6261,7 +6265,7 @@ static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); @@ -7493,7 +7497,8 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; @@ -7543,7 +7548,7 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); -- cgit v1.3 From b3e8f29d6b45e865bab9a9964709ff7413e33e85 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:15 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splats with flowtables The transaction mutex prevents concurrent add/delete, its ok to iterate those lists outside of rcu read side critical sections. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 15 +++++++++------ net/netfilter/nft_flow_offload.c | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 91ae20cb7648..c1513bd14568 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1463,7 +1463,8 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a51731d76401..9e367e134691 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8378,12 +8378,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -8739,7 +8741,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); @@ -8933,7 +8935,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb, flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; - flowtable = nft_flowtable_lookup(table, attr, genmask); + flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { @@ -9003,7 +9005,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -9145,7 +9148,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 2f732fae5a83..65199c23c75c 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -409,8 +409,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->net, ctx->table, + tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); -- cgit v1.3 From 28b7a6b84c0aea37c5f796e14b479f1e8dbeba12 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:16 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splats in set walker Its not possible to add or delete elements from hash and bitmap sets, as long as caller is holding the transaction mutex, so its ok to iterate the list outside of rcu read side critical section. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_bitmap.c | 10 ++++++---- net/netfilter/nft_set_hash.c | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 1caa04619dc6..12390d2e994f 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -88,13 +88,15 @@ bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, } static struct nft_bitmap_elem * -nft_bitmap_elem_find(const struct nft_set *set, struct nft_bitmap_elem *this, +nft_bitmap_elem_find(const struct net *net, + const struct nft_set *set, struct nft_bitmap_elem *this, u8 genmask) { const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(net)->commit_mutex)) { if (memcmp(nft_set_ext_key(&be->ext), nft_set_ext_key(&this->ext), set->klen) || !nft_set_elem_active(&be->ext, genmask)) @@ -132,7 +134,7 @@ static int nft_bitmap_insert(const struct net *net, const struct nft_set *set, u8 genmask = nft_genmask_next(net); u32 idx, off; - be = nft_bitmap_elem_find(set, new, genmask); + be = nft_bitmap_elem_find(net, set, new, genmask); if (be) { *elem_priv = &be->priv; return -EEXIST; @@ -201,7 +203,7 @@ nft_bitmap_deactivate(const struct net *net, const struct nft_set *set, nft_bitmap_location(set, elem->key.val.data, &idx, &off); - be = nft_bitmap_elem_find(set, this, genmask); + be = nft_bitmap_elem_find(net, set, this, genmask); if (!be) return NULL; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index daa56dda737a..65bd291318f2 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -647,7 +647,8 @@ static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set, int i; for (i = 0; i < priv->buckets; i++) { - hlist_for_each_entry_rcu(he, &priv->table[i], node) { + hlist_for_each_entry_rcu(he, &priv->table[i], node, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; -- cgit v1.3 From 3567146b94afcd69d4916c880eb5b1b0e3797397 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:17 +0100 Subject: netfilter: nf_tables: avoid false-positive lockdep splats with basechain hook Like previous patches: iteration is ok if the list cannot be altered in parallel. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9e367e134691..3b5154f2dd79 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1824,7 +1824,8 @@ nla_put_failure: return -ENOSPC; } -static int nft_dump_basechain_hook(struct sk_buff *skb, int family, +static int nft_dump_basechain_hook(struct sk_buff *skb, + const struct net *net, int family, const struct nft_base_chain *basechain, const struct list_head *hook_list) { @@ -1849,7 +1850,8 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, int family, if (!hook_list) hook_list = &basechain->hook_list; - list_for_each_entry_rcu(hook, hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (!first) first = hook; @@ -1900,7 +1902,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, family, basechain, hook_list)) + if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, -- cgit v1.3 From ee666a541ed957937454d50afa4757924508cd74 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:18 +0100 Subject: netfilter: nf_tables: must hold rcu read lock while iterating expression type list nft shell tests trigger: WARNING: suspicious RCU usage net/netfilter/nf_tables_api.c:3125 RCU-list traversed in non-reader section!! 1 lock held by nft/2068: #0: ffff888106c6f8c8 (&nft_net->commit_mutex){+.+.}-{4:4}, at: nf_tables_valid_genid+0x3c/0xf0 But the transaction mutex doesn't protect this list, the nfnl subsystem mutex would, but we can't acquire it here without risk of ABBA deadlocks. Acquire the rcu read lock to avoid this issue. v3: add a comment that explains the ->inner_ops check implies expression is builtin and lack of a module owner reference is ok. Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3b5154f2dd79..de8e48a5c62d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3296,25 +3296,37 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, if (!tb[NFTA_EXPR_DATA] || !tb[NFTA_EXPR_NAME]) return -EINVAL; + rcu_read_lock(); + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); - if (!type) - return -ENOENT; + if (!type) { + err = -ENOENT; + goto out_unlock; + } - if (!type->inner_ops) - return -EOPNOTSUPP; + if (!type->inner_ops) { + err = -EOPNOTSUPP; + goto out_unlock; + } err = nla_parse_nested_deprecated(info->tb, type->maxattr, tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) - goto err_nla_parse; + goto out_unlock; info->attr = nla; info->ops = type->inner_ops; + /* No module reference will be taken on type->owner. + * Presence of type->inner_ops implies that the expression + * is builtin, so it cannot go away. + */ + rcu_read_unlock(); return 0; -err_nla_parse: +out_unlock: + rcu_read_unlock(); return err; } -- cgit v1.3 From cddc04275f95ca3b18da5c0fb111705ac173af89 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:19 +0100 Subject: netfilter: nf_tables: must hold rcu read lock while iterating object type list Update of stateful object triggers: WARNING: suspicious RCU usage net/netfilter/nf_tables_api.c:7759 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by nft/3060: #0: ffff88810f0578c8 (&nft_net->commit_mutex){+.+.}-{4:4}, [..] ... but this list is not protected by the transaction mutex but the nfnl nftables subsystem mutex. Switch to nft_obj_type_get which will acquire rcu read lock, bump refcount, and returns the result. v3: Dan Carpenter points out nft_obj_type_get returns error pointer, not NULL, on error. Fixes: dad3bdeef45f ("netfilter: nf_tables: fix memory leak during stateful obj update"). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index de8e48a5c62d..b7a817e483aa 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7809,9 +7809,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, struct nft_trans *trans; int err = -ENOMEM; - if (!try_module_get(type->owner)) - return -ENOENT; - + /* caller must have obtained type->owner reference. */ trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ, sizeof(struct nft_trans_obj)); if (!trans) @@ -7879,15 +7877,16 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - type = __nft_obj_type_get(objtype, family); - if (WARN_ON_ONCE(!type)) - return -ENOENT; - if (!obj->ops->update) return 0; + type = nft_obj_type_get(net, objtype, family); + if (WARN_ON_ONCE(IS_ERR(type))) + return PTR_ERR(type); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + /* type->owner reference is put when transaction object is released. */ return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } -- cgit v1.3 From 7d1c2d517f503c63aac3775b51ec96210a6e6ef9 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Fri, 1 Nov 2024 16:47:32 -0400 Subject: openvswitch: Pass on secpath details for internal port rx. Clearing the secpath for internal ports will cause packet drops when ipsec offload or early SW ipsec decrypt are used. Systems that rely on these will not be able to actually pass traffic via openvswitch. There is still an open issue for a flow miss packet - this is because we drop the extensions during upcall and there is no facility to restore such data (and it is non-trivial to add such functionality to the upcall interface). That means that when a flow miss occurs, there will still be packet drops. With this patch, when a flow is found then traffic which has an associated xfrm extension will properly flow. Signed-off-by: Aaron Conole Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20241101204732.183840-1-aconole@redhat.com Signed-off-by: Jakub Kicinski --- net/openvswitch/vport-internal_dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 5858d65ea1a9..2412d7813d24 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -195,7 +195,6 @@ static int internal_dev_recv(struct sk_buff *skb) skb_dst_drop(skb); nf_reset_ct(skb); - secpath_reset(skb); skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); -- cgit v1.3 From cfbbd4859882a5469f6f4945937a074ee78c4b46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 4 Nov 2024 13:31:41 +0100 Subject: mptcp: no admin perm to list endpoints During the switch to YNL, the command to list all endpoints has been accidentally restricted to users with admin permissions. It looks like there are no reasons to have this restriction which makes it harder for a user to quickly check if the endpoint list has been correctly populated by an automated tool. Best to go back to the previous behaviour then. mptcp_pm_gen.c has been modified using ynl-gen-c.py: $ ./tools/net/ynl/ynl-gen-c.py --mode kernel \ --spec Documentation/netlink/specs/mptcp_pm.yaml --source \ -o net/mptcp/mptcp_pm_gen.c The header file doesn't need to be regenerated. Fixes: 1d0507f46843 ("net: mptcp: convert netlink from small_ops to ops") Cc: stable@vger.kernel.org Reviewed-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241104-net-mptcp-misc-6-12-v1-1-c13f2ff1656f@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 1 - net/mptcp/mptcp_pm_gen.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 30d8342cacc8..dc190bf838fe 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -293,7 +293,6 @@ operations: doc: Get endpoint information attribute-set: attr dont-validate: [ strict ] - flags: [ uns-admin-perm ] do: &get-addr-attrs request: attributes: diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index c30a2a90a192..bfb37c5a88c4 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -112,7 +112,6 @@ const struct genl_ops mptcp_pm_nl_ops[11] = { .dumpit = mptcp_pm_nl_get_addr_dumpit, .policy = mptcp_pm_get_addr_nl_policy, .maxattr = MPTCP_PM_ATTR_TOKEN, - .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, -- cgit v1.3 From 99635c91fb8b860a6404b9bc8b769df7bdaa2ae3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 4 Nov 2024 13:31:42 +0100 Subject: mptcp: use sock_kfree_s instead of kfree The local address entries on userspace_pm_local_addr_list are allocated by sock_kmalloc(). It's then required to use sock_kfree_s() instead of kfree() to free these entries in order to adjust the allocated size on the sk side. Fixes: 24430f8bf516 ("mptcp: add address into userspace pm list") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241104-net-mptcp-misc-6-12-v1-2-c13f2ff1656f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 2cceded3a83a..56dfea9862b7 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -91,6 +91,7 @@ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { struct mptcp_pm_addr_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { @@ -98,7 +99,7 @@ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, * be used multiple times (e.g. fullmesh mode). */ list_del_rcu(&entry->list); - kfree(entry); + sock_kfree_s(sk, entry, sizeof(*entry)); msk->pm.local_addr_used--; return 0; } -- cgit v1.3 From f2c71c49da8f8941e3e465605fc41939eee9210a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 4 Nov 2024 13:43:47 +0100 Subject: mptcp: remove unneeded lock when listing scheds mptcp_get_available_schedulers() needs to iterate over the schedulers' list only to read the names: it doesn't modify anything there. In this case, it is enough to hold the RCU read lock, no need to combine this with the associated spin lock as it was done since its introduction in commit 73c900aa3660 ("mptcp: add net.mptcp.available_schedulers"). Suggested-by: Paolo Abeni Reviewed-by: Geliang Tang Reviewed-by: Simon Horman Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241104-net-next-mptcp-sched-unneeded-lock-v2-1-2ccc1e0c750c@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 78ed508ebc1b..df7dbcfa3b71 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -60,7 +60,6 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) size_t offs = 0; rcu_read_lock(); - spin_lock(&mptcp_sched_list_lock); list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", @@ -69,7 +68,6 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) if (WARN_ON_ONCE(offs >= maxlen)) break; } - spin_unlock(&mptcp_sched_list_lock); rcu_read_unlock(); } -- cgit v1.3 From 6ca575374dd9a507cdd16dfa0e78c2e9e20bd05f Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Tue, 22 Oct 2024 09:32:56 +0200 Subject: vsock/virtio: Initialization of the dangling pointer occurring in vsk->trans During loopback communication, a dangling pointer can be created in vsk->trans, potentially leading to a Use-After-Free condition. This issue is resolved by initializing vsk->trans to NULL. Cc: stable Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") Signed-off-by: Hyunwoo Kim Signed-off-by: Wongi Lee Signed-off-by: Greg Kroah-Hartman Message-Id: <2024102245-strive-crib-c8d3@gregkh> Signed-off-by: Michael S. Tsirkin --- net/vmw_vsock/virtio_transport_common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index ccbd2bc0d210..fc5666c8298f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1109,6 +1109,7 @@ void virtio_transport_destruct(struct vsock_sock *vsk) struct virtio_vsock_sock *vvs = vsk->trans; kfree(vvs); + vsk->trans = NULL; } EXPORT_SYMBOL_GPL(virtio_transport_destruct); -- cgit v1.3 From 766f532089afd202a537f44c09a88ab9912f07d7 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:36 +0100 Subject: xfrm: Convert xfrm_get_tos() to dscp_t. Return a dscp_t variable to prepare for the future conversion of xfrm_bundle_create() to dscp_t. While there, rename the function "xfrm_get_dscp", to align its name with the new return type. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8a1b83191a6c..51a071a79016 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2587,10 +2587,10 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, } -static int xfrm_get_tos(const struct flowi *fl, int family) +static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) - return fl->u.ip4.flowi4_tos & INET_DSCP_MASK; + return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); return 0; } @@ -2684,7 +2684,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, xfrm_flowi_addr_get(fl, &saddr, &daddr, family); - tos = xfrm_get_tos(fl, family); + tos = inet_dscp_to_dsfield(xfrm_get_dscp(fl, family)); dst_hold(dst); -- cgit v1.3 From 01f61cbfc8b2cf89fe960ea3c1c67bba089dbdc5 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:43 +0100 Subject: xfrm: Convert xfrm_bundle_create() to dscp_t. Use a dscp_t variable to store the result of xfrm_get_dscp(). This prepares for the future conversion of xfrm_dst_lookup(). Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 51a071a79016..ecb989347bd4 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2678,13 +2678,13 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, int header_len = 0; int nfheader_len = 0; int trailer_len = 0; - int tos; int family = policy->selector.family; xfrm_address_t saddr, daddr; + dscp_t dscp; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); - tos = inet_dscp_to_dsfield(xfrm_get_dscp(fl, family)); + dscp = xfrm_get_dscp(fl, family); dst_hold(dst); @@ -2732,7 +2732,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, family = xfrm[i]->props.family; oif = fl->flowi_oif ? : fl->flowi_l3mdev; - dst = xfrm_dst_lookup(xfrm[i], tos, oif, + dst = xfrm_dst_lookup(xfrm[i], + inet_dscp_to_dsfield(dscp), oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) -- cgit v1.3 From 3021a2a3403df0fe0b79af15071e5f6ee25461a4 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:49 +0100 Subject: xfrm: Convert xfrm_dst_lookup() to dscp_t. Pass a dscp_t variable to xfrm_dst_lookup(), instead of an int, to prevent accidental setting of ECN bits in ->flowi4_tos. Only xfrm_bundle_create() actually calls xfrm_dst_lookup(). Since it already has a dscp_t variable to pass as parameter, we only need to remove the inet_dscp_to_dsfield() conversion. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ecb989347bd4..7e3e10fb9ca0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -289,7 +289,7 @@ struct dst_entry *__xfrm_dst_lookup(int family, EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, - int tos, int oif, + dscp_t dscp, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) @@ -312,7 +312,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, params.net = net; params.saddr = saddr; params.daddr = daddr; - params.tos = tos; + params.tos = inet_dscp_to_dsfield(dscp); params.oif = oif; params.mark = mark; params.ipproto = x->id.proto; @@ -2732,9 +2732,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, family = xfrm[i]->props.family; oif = fl->flowi_oif ? : fl->flowi_l3mdev; - dst = xfrm_dst_lookup(xfrm[i], - inet_dscp_to_dsfield(dscp), oif, - &saddr, &daddr, family, mark); + dst = xfrm_dst_lookup(xfrm[i], dscp, oif, &saddr, + &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; -- cgit v1.3 From e57dfaa4b0a72f6a231a8eedb95d260045bbd8db Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 31 Oct 2024 16:52:57 +0100 Subject: xfrm: Convert struct xfrm_dst_lookup_params -> tos to dscp_t. Add type annotation to the "tos" field of struct xfrm_dst_lookup_params, to ensure that the ECN bits aren't mistakenly taken into account when doing route lookups. Rename that field (tos -> dscp) to make that change explicit. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/ipv4/xfrm4_policy.c | 3 ++- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2b87999bd5aa..32c09e85a64c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -354,7 +355,7 @@ void xfrm_if_unregister_cb(void); struct xfrm_dst_lookup_params { struct net *net; - int tos; + dscp_t dscp; int oif; xfrm_address_t *saddr; xfrm_address_t *daddr; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7e1c2faed1ff..7fb6205619e7 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = params->tos; + fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7e3e10fb9ca0..4408c11c0835 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -312,7 +312,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, params.net = net; params.saddr = saddr; params.daddr = daddr; - params.tos = inet_dscp_to_dsfield(dscp); + params.dscp = dscp; params.oif = oif; params.mark = mark; params.ipproto = x->id.proto; -- cgit v1.3 From 15ab0548e3107665c34579ae523b2b6e7c22082a Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 22:25:18 +0000 Subject: bpf, sockmap: Several fixes to bpf_msg_push_data Several fixes to bpf_msg_push_data, 1. test_sockmap has tests where bpf_msg_push_data is invoked to push some data at the end of a message, but -EINVAL is returned. In this case, in bpf_msg_push_data, after the first loop, i will be set to msg->sg.end, add the logic to handle it. 2. In the code block of "if (start - offset)", it's possible that "i" points to the last of sk_msg_elem. In this case, "sk_msg_iter_next(msg, end)" might still be called twice, another invoking is in "if (!copy)" code block, but actually only one is needed. Add the logic to handle it, and reconstruct the code to make the logic more clear. Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data") Signed-off-by: Zijian Zhang Link: https://lore.kernel.org/r/20241106222520.527076-7-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 53 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 82f92ed0dc72..255d58bae2a9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2778,7 +2778,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); } while (i != msg->sg.end); - if (start >= offset + l) + if (start > offset + l) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2803,6 +2803,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, raw = page_address(page); + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); front = start - offset; back = psge->length - front; @@ -2819,7 +2821,13 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, } put_page(sg_page(psge)); - } else if (start - offset) { + new = i; + goto place_new; + } + + if (start - offset) { + if (i == msg->sg.end) + sk_msg_iter_var_prev(i); psge = sk_msg_elem(msg, i); rsge = sk_msg_elem_cpy(msg, i); @@ -2830,39 +2838,44 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, sk_msg_iter_var_next(i); sg_unmark_end(psge); sg_unmark_end(&rsge); - sk_msg_iter_next(msg, end); } /* Slot(s) to place newly allocated data */ + sk_msg_iter_next(msg, end); new = i; + sk_msg_iter_var_next(i); + + if (i == msg->sg.end) { + if (!rsge.length) + goto place_new; + sk_msg_iter_next(msg, end); + goto place_new; + } /* Shift one or two slots as needed */ - if (!copy) { - sge = sk_msg_elem_cpy(msg, i); + sge = sk_msg_elem_cpy(msg, new); + sg_unmark_end(&sge); + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { sk_msg_iter_var_next(i); - sg_unmark_end(&sge); + nnsge = sk_msg_elem_cpy(msg, i); sk_msg_iter_next(msg, end); + } - nsge = sk_msg_elem_cpy(msg, i); + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); if (rsge.length) { - sk_msg_iter_var_next(i); + nsge = nnsge; nnsge = sk_msg_elem_cpy(msg, i); - } - - while (i != msg->sg.end) { - msg->sg.data[i] = sge; - sge = nsge; - sk_msg_iter_var_next(i); - if (rsge.length) { - nsge = nnsge; - nnsge = sk_msg_elem_cpy(msg, i); - } else { - nsge = sk_msg_elem_cpy(msg, i); - } + } else { + nsge = sk_msg_elem_cpy(msg, i); } } +place_new: /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; -- cgit v1.3 From 5d609ba262475db450ba69b8e8a557bd768ac07a Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 22:25:19 +0000 Subject: bpf, sockmap: Several fixes to bpf_msg_pop_data Several fixes to bpf_msg_pop_data, 1. In sk_msg_shift_left, we should put_page 2. if (len == 0), return early is better 3. pop the entire sk_msg (last == msg->sg.size) should be supported 4. Fix for the value of variable "a" 5. In sk_msg_shift_left, after shifting, i has already pointed to the next element. Addtional sk_msg_iter_var_next may result in BUG. Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages") Signed-off-by: Zijian Zhang Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20241106222520.527076-8-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 255d58bae2a9..2fdba950b575 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2904,8 +2904,10 @@ static const struct bpf_func_proto bpf_msg_push_data_proto = { static void sk_msg_shift_left(struct sk_msg *msg, int i) { + struct scatterlist *sge = sk_msg_elem(msg, i); int prev; + put_page(sg_page(sge)); do { prev = i; sk_msg_iter_var_next(i); @@ -2942,6 +2944,9 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(flags)) return -EINVAL; + if (unlikely(len == 0)) + return 0; + /* First find the starting scatterlist element */ i = msg->sg.start; do { @@ -2954,7 +2959,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } while (i != msg->sg.end); /* Bounds checks: start and pop must be inside message */ - if (start >= offset + l || last >= msg->sg.size) + if (start >= offset + l || last > msg->sg.size) return -EINVAL; space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); @@ -2983,12 +2988,12 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, */ if (start != offset) { struct scatterlist *nsge, *sge = sk_msg_elem(msg, i); - int a = start; + int a = start - offset; int b = sge->length - pop - a; sk_msg_iter_var_next(i); - if (pop < sge->length - a) { + if (b > 0) { if (space) { sge->length = a; sk_msg_shift_right(msg, i); @@ -3007,7 +3012,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, if (unlikely(!page)) return -ENOMEM; - sge->length = a; orig = sg_page(sge); from = sg_virt(sge); to = page_address(page); @@ -3017,7 +3021,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, put_page(orig); } pop = 0; - } else if (pop >= sge->length - a) { + } else { pop -= (sge->length - a); sge->length = a; } @@ -3051,7 +3055,6 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, pop -= sge->length; sk_msg_shift_left(msg, i); } - sk_msg_iter_var_next(i); } sk_mem_uncharge(msg->sk, len - pop); -- cgit v1.3 From 955afd57dc4bf7e8c620a0a9e3af3c881c2c6dff Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Wed, 6 Nov 2024 22:25:20 +0000 Subject: bpf, sockmap: Fix sk_msg_reset_curr Found in the test_txmsg_pull in test_sockmap, ``` txmsg_cork = 512; // corking is importrant here opt->iov_length = 3; opt->iov_count = 1; opt->rate = 512; // sendmsg will be invoked 512 times ``` The first sendmsg will send an sk_msg with size 3, and bpf_msg_pull_data will be invoked the first time. sk_msg_reset_curr will reset the copybreak from 3 to 0. In the second sendmsg, since we are in the stage of corking, psock->cork will be reused in func sk_msg_alloc. msg->sg.copybreak is 0 now, the second msg will overwrite the first msg. As a result, we could not pass the data integrity test. The same problem happens in push and pop test. Thus, fix sk_msg_reset_curr to restore the correct copybreak. Fixes: bb9aefde5bba ("bpf: sockmap, updating the sg structure should also update curr") Signed-off-by: Zijian Zhang Link: https://lore.kernel.org/r/20241106222520.527076-9-zijianzhang@bytedance.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 2fdba950b575..64248d0ac4ad 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2604,18 +2604,16 @@ BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes) static void sk_msg_reset_curr(struct sk_msg *msg) { - u32 i = msg->sg.start; - u32 len = 0; - - do { - len += sk_msg_elem(msg, i)->length; - sk_msg_iter_var_next(i); - if (len >= msg->sg.size) - break; - } while (i != msg->sg.end); + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else { + u32 i = msg->sg.end; - msg->sg.curr = i; - msg->sg.copybreak = 0; + sk_msg_iter_var_prev(i); + msg->sg.curr = i; + msg->sg.copybreak = msg->sg.data[i].length; + } } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { -- cgit v1.3 From 2634303f8773b0c602069887565cd412440be15d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 Nov 2024 09:14:58 +0100 Subject: alarmtimers: Remove return value from alarm functions Now that the SIG_IGN problem is solved in the core code, the alarmtimer callbacks do not require a return value anymore. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20241105064214.318837272@linutronix.de --- drivers/power/supply/charger-manager.c | 3 +-- fs/timerfd.c | 4 +--- include/linux/alarmtimer.h | 10 ++-------- kernel/time/alarmtimer.c | 16 +++++----------- net/netfilter/xt_IDLETIMER.c | 4 +--- 5 files changed, 10 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/drivers/power/supply/charger-manager.c b/drivers/power/supply/charger-manager.c index 96f0a7fbf105..09ec0ecf1486 100644 --- a/drivers/power/supply/charger-manager.c +++ b/drivers/power/supply/charger-manager.c @@ -1412,10 +1412,9 @@ static inline struct charger_desc *cm_get_drv_data(struct platform_device *pdev) return dev_get_platdata(&pdev->dev); } -static enum alarmtimer_restart cm_timer_func(struct alarm *alarm, ktime_t now) +static void cm_timer_func(struct alarm *alarm, ktime_t now) { cm_timer_set = false; - return ALARMTIMER_NORESTART; } static int charger_manager_probe(struct platform_device *pdev) diff --git a/fs/timerfd.c b/fs/timerfd.c index 137523e0bb21..f10c99ad5c60 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -79,13 +79,11 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) return HRTIMER_NORESTART; } -static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, - ktime_t now) +static void timerfd_alarmproc(struct alarm *alarm, ktime_t now) { struct timerfd_ctx *ctx = container_of(alarm, struct timerfd_ctx, t.alarm); timerfd_triggered(ctx); - return ALARMTIMER_NORESTART; } /* diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 05e758b8b894..3ffa5341dce2 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -20,12 +20,6 @@ enum alarmtimer_type { ALARM_BOOTTIME_FREEZER, }; -enum alarmtimer_restart { - ALARMTIMER_NORESTART, - ALARMTIMER_RESTART, -}; - - #define ALARMTIMER_STATE_INACTIVE 0x00 #define ALARMTIMER_STATE_ENQUEUED 0x01 @@ -42,14 +36,14 @@ enum alarmtimer_restart { struct alarm { struct timerqueue_node node; struct hrtimer timer; - enum alarmtimer_restart (*function)(struct alarm *, ktime_t now); + void (*function)(struct alarm *, ktime_t now); enum alarmtimer_type type; int state; void *data; }; void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); + void (*function)(struct alarm *, ktime_t)); void alarm_start(struct alarm *alarm, ktime_t start); void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 593e7d561fa8..37d2d79daea4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -321,7 +321,7 @@ static int alarmtimer_resume(struct device *dev) static void __alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { timerqueue_init(&alarm->node); alarm->timer.function = alarmtimer_fired; @@ -337,7 +337,7 @@ __alarm_init(struct alarm *alarm, enum alarmtimer_type type, * @function: callback that is run when the alarm fires */ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, HRTIMER_MODE_ABS); @@ -530,14 +530,12 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * * Return: whether the timer is to be restarted */ -static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) +static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { struct k_itimer *ptr = container_of(alarm, struct k_itimer, it.alarm.alarmtimer); guard(spinlock_irqsave)(&ptr->it_lock); posix_timer_queue_signal(ptr); - - return ALARMTIMER_NORESTART; } /** @@ -698,18 +696,14 @@ static int alarm_timer_create(struct k_itimer *new_timer) * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer - * - * Return: ALARMTIMER_NORESTART */ -static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, - ktime_t now) +static void alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) { struct task_struct *task = alarm->data; alarm->data = NULL; if (task) wake_up_process(task); - return ALARMTIMER_NORESTART; } /** @@ -761,7 +755,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, static void alarm_init_on_stack(struct alarm *alarm, enum alarmtimer_type type, - enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) + void (*function)(struct alarm *, ktime_t)) { hrtimer_init_on_stack(&alarm->timer, alarm_bases[type].base_clockid, HRTIMER_MODE_ABS); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index db720efa811d..5514600586a9 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -107,14 +107,12 @@ static void idletimer_tg_expired(struct timer_list *t) schedule_work(&timer->work); } -static enum alarmtimer_restart idletimer_tg_alarmproc(struct alarm *alarm, - ktime_t now) +static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); - return ALARMTIMER_NORESTART; } static int idletimer_check_sysfs_name(const char *name, unsigned int size) -- cgit v1.3 From eb02688c5c45c3e7af7e71f036a7144f5639cbfe Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 5 Nov 2024 19:23:50 +0100 Subject: ipv6: release nexthop on device removal The CI is hitting some aperiodic hangup at device removal time in the pmtu.sh self-test: unregister_netdevice: waiting for veth_A-R1 to become free. Usage count = 6 ref_tracker: veth_A-R1@ffff888013df15d8 has 1/5 users at dst_init+0x84/0x4a0 dst_alloc+0x97/0x150 ip6_dst_alloc+0x23/0x90 ip6_rt_pcpu_alloc+0x1e6/0x520 ip6_pol_route+0x56f/0x840 fib6_rule_lookup+0x334/0x630 ip6_route_output_flags+0x259/0x480 ip6_dst_lookup_tail.constprop.0+0x5c2/0x940 ip6_dst_lookup_flow+0x88/0x190 udp_tunnel6_dst_lookup+0x2a7/0x4c0 vxlan_xmit_one+0xbde/0x4a50 [vxlan] vxlan_xmit+0x9ad/0xf20 [vxlan] dev_hard_start_xmit+0x10e/0x360 __dev_queue_xmit+0xf95/0x18c0 arp_solicit+0x4a2/0xe00 neigh_probe+0xaa/0xf0 While the first suspect is the dst_cache, explicitly tracking the dst owing the last device reference via probes proved such dst is held by the nexthop in the originating fib6_info. Similar to commit f5b51fe804ec ("ipv6: route: purge exception on removal"), we need to explicitly release the originating fib info when disconnecting a to-be-removed device from a live ipv6 dst: move the fib6_info cleanup into ip6_dst_ifdown(). Tested running: ./pmtu.sh cleanup_ipv6_exception in a tight loop for more than 400 iterations with no spat, running an unpatched kernel I observed a splat every ~10 iterations. Fixes: f88d8ea67fbd ("ipv6: Plumb support for nexthop object in a fib6_info") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/604c45c188c609b732286b47ac2a451a40f6cf6d.1730828007.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d7ce5cf2017a..038c1eeef0be 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -374,6 +374,7 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; + struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); @@ -383,6 +384,8 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) in6_dev_put(idev); } } + from = unrcu_pointer(xchg(&rt->from, NULL)); + fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) @@ -1455,7 +1458,6 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { - struct fib6_info *from; struct net *net; if (!bucket || !rt6_ex) @@ -1467,8 +1469,6 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ - from = unrcu_pointer(xchg(&rt6_ex->rt6i->from, NULL)); - fib6_info_release(from); dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); -- cgit v1.3 From eb688451dcfb7de0fef678a476096d3616228815 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 31 Oct 2024 16:14:26 +0100 Subject: net: pktgen: Switch to use hrtimer_setup_sleeper_on_stack() hrtimer_setup_sleeper_on_stack() replaces hrtimer_init_sleeper_on_stack() to keep the naming convention consistent. Convert the usage site over to it. The conversion was done with Coccinelle. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/c4b40b8fef250b6a325e1b8bd6057005fb3cb660.1730386209.git.namcao@linutronix.de --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 34f68ef74b8f..7e23cacbe66e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2285,7 +2285,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) s64 remaining; struct hrtimer_sleeper t; - hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); -- cgit v1.3 From 9907cda95fcbf44141b1292faab89cf8ec542f22 Mon Sep 17 00:00:00 2001 From: Juraj Šarinay Date: Sun, 3 Nov 2024 13:45:25 +0100 Subject: net: nfc: Propagate ISO14443 type A target ATS to userspace via netlink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 20-byte field ats to struct nfc_target and expose it as NFC_ATTR_TARGET_ATS via the netlink interface. The payload contains 'historical bytes' that help to distinguish cards from one another. The information is commonly used to assemble an emulated ATR similar to that reported by smart cards with contacts. Add a 20-byte field target_ats to struct nci_dev to hold the payload obtained in nci_rf_intf_activated_ntf_packet() and copy it to over to nfc_target.ats in nci_activate_target(). The approach is similar to the handling of 'general bytes' within ATR_RES. Replace the hard-coded size of rats_res within struct activation_params_nfca_poll_iso_dep by the equal constant NFC_ATS_MAXSIZE now defined in nfc.h Within NCI, the information corresponds to the 'RATS Response' activation parameter that omits the initial length byte TL. This loses no information and is consistent with our handling of SENSB_RES that also drops the first (constant) byte. Tested with nxp_nci_i2c on a few type A targets including an ICAO 9303 compliant passport. I refrain from the corresponding change to digital_in_recv_ats() to have the few drivers based on digital.h fill nfc_target.ats, as I have no way to test it. That class of drivers appear not to set NFC_ATTR_TARGET_SENSB_RES either. Consider a separate patch to propagate (all) the parameters. Signed-off-by: Juraj Šarinay Link: https://patch.msgid.link/20241103124525.8392-1-juraj@sarinay.com Signed-off-by: Paolo Abeni --- include/net/nfc/nci.h | 2 +- include/net/nfc/nci_core.h | 4 ++++ include/net/nfc/nfc.h | 4 ++++ include/uapi/linux/nfc.h | 3 +++ net/nfc/nci/core.c | 13 ++++++++++++- net/nfc/nci/ntf.c | 32 +++++++++++++++++++++++++++++++- net/nfc/netlink.c | 5 +++++ 7 files changed, 60 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index dc36519d16aa..09efcaed7c3f 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -475,7 +475,7 @@ struct nci_rf_discover_ntf { #define NCI_OP_RF_INTF_ACTIVATED_NTF nci_opcode_pack(NCI_GID_RF_MGMT, 0x05) struct activation_params_nfca_poll_iso_dep { __u8 rats_res_len; - __u8 rats_res[20]; + __u8 rats_res[NFC_ATS_MAXSIZE]; }; struct activation_params_nfcb_poll_iso_dep { diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index ea8595651c38..e180bdf2f82b 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -265,6 +265,10 @@ struct nci_dev { /* stored during intf_activated_ntf */ __u8 remote_gb[NFC_MAX_GT_LEN]; __u8 remote_gb_len; + + /* stored during intf_activated_ntf */ + __u8 target_ats[NFC_ATS_MAXSIZE]; + __u8 target_ats_len; }; /* ----- NCI Devices ----- */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 3a3781838c67..127e6c7d910d 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -86,6 +86,8 @@ struct nfc_ops { * is a type A one. The %sens_res most significant byte must be byte 2 * as described by the NFC Forum digital specification (i.e. the platform * configuration one) while %sens_res least significant byte is byte 1. + * @ats_len: length of Answer To Select in bytes + * @ats: Answer To Select returned by an ISO 14443 Type A target upon activation */ struct nfc_target { u32 idx; @@ -105,6 +107,8 @@ struct nfc_target { u8 is_iso15693; u8 iso15693_dsfid; u8 iso15693_uid[NFC_ISO15693_UID_MAXSIZE]; + u8 ats_len; + u8 ats[NFC_ATS_MAXSIZE]; }; /** diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 4fa4e979e948..2f5b4be25261 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -164,6 +164,7 @@ enum nfc_commands { * @NFC_ATTR_VENDOR_SUBCMD: Vendor specific sub command * @NFC_ATTR_VENDOR_DATA: Vendor specific data, to be optionally passed * to a vendor specific command implementation + * @NFC_ATTR_TARGET_ATS: ISO 14443 type A target Answer To Select */ enum nfc_attrs { NFC_ATTR_UNSPEC, @@ -198,6 +199,7 @@ enum nfc_attrs { NFC_ATTR_VENDOR_ID, NFC_ATTR_VENDOR_SUBCMD, NFC_ATTR_VENDOR_DATA, + NFC_ATTR_TARGET_ATS, /* private: internal use only */ __NFC_ATTR_AFTER_LAST }; @@ -225,6 +227,7 @@ enum nfc_sdp_attr { #define NFC_GB_MAXSIZE 48 #define NFC_FIRMWARE_NAME_MAXSIZE 32 #define NFC_ISO15693_UID_MAXSIZE 8 +#define NFC_ATS_MAXSIZE 20 /* NFC protocols */ #define NFC_PROTO_JEWEL 1 diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index f456a5911e7d..1ec5955fe469 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -757,6 +757,14 @@ int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) } EXPORT_SYMBOL(nci_core_conn_close); +static void nci_set_target_ats(struct nfc_target *target, struct nci_dev *ndev) +{ + if (ndev->target_ats_len > 0) { + target->ats_len = ndev->target_ats_len; + memcpy(target->ats, ndev->target_ats, target->ats_len); + } +} + static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); @@ -939,8 +947,11 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT)); } - if (!rc) + if (!rc) { ndev->target_active_prot = protocol; + if (protocol == NFC_PROTO_ISO14443) + nci_set_target_ats(target, ndev); + } return rc; } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 994a0a1efb58..a818eff27e6b 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -402,7 +402,7 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: nfca_poll = &ntf->activation_params.nfca_poll_iso_dep; - nfca_poll->rats_res_len = min_t(__u8, *data++, 20); + nfca_poll->rats_res_len = min_t(__u8, *data++, NFC_ATS_MAXSIZE); pr_debug("rats_res_len %d\n", nfca_poll->rats_res_len); if (nfca_poll->rats_res_len > 0) { memcpy(nfca_poll->rats_res, @@ -531,6 +531,28 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, return NCI_STATUS_OK; } +static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, + const struct nci_rf_intf_activated_ntf *ntf) +{ + ndev->target_ats_len = 0; + + if (ntf->activation_params_len <= 0) + return NCI_STATUS_OK; + + if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > NFC_ATS_MAXSIZE) { + pr_debug("ATS too long\n"); + return NCI_STATUS_RF_PROTOCOL_ERROR; + } + + if (ntf->activation_params.nfca_poll_iso_dep.rats_res_len > 0) { + ndev->target_ats_len = ntf->activation_params.nfca_poll_iso_dep.rats_res_len; + memcpy(ndev->target_ats, ntf->activation_params.nfca_poll_iso_dep.rats_res, + ndev->target_ats_len); + } + + return NCI_STATUS_OK; +} + static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, const struct sk_buff *skb) { @@ -660,6 +682,14 @@ exit: if (err != NCI_STATUS_OK) pr_err("unable to store general bytes\n"); } + + /* store ATS to be reported later in nci_activate_target */ + if (ntf.rf_interface == NCI_RF_INTERFACE_ISO_DEP && + ntf.activation_rf_tech_and_mode == NCI_NFC_A_PASSIVE_POLL_MODE) { + err = nci_store_ats_nfc_iso_dep(ndev, &ntf); + if (err != NCI_STATUS_OK) + pr_err("unable to store ATS\n"); + } } if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) { diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index dd2ce73a24fb..6a40b8d0350d 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -96,6 +96,11 @@ static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, goto nla_put_failure; } + if (target->ats_len > 0 && + nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, + target->ats)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; -- cgit v1.3 From c03d278fdf35e73dd0ec543b9b556876b9d9a8dc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 5 Nov 2024 12:07:22 +0100 Subject: netfilter: nf_tables: wait for rcu grace period on net_device removal 8c873e219970 ("netfilter: core: free hooks with call_rcu") removed synchronize_net() call when unregistering basechain hook, however, net_device removal event handler for the NFPROTO_NETDEV was not updated to wait for RCU grace period. Note that 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") does not remove basechain rules on device removal, I was hinted to remove rules on net_device removal later, see 5ebe0b0eec9d ("netfilter: nf_tables: destroy basechain and rules on netdevice removal"). Although NETDEV_UNREGISTER event is guaranteed to be handled after synchronize_net() call, this path needs to wait for rcu grace period via rcu callback to release basechain hooks if netns is alive because an ongoing netlink dump could be in progress (sockets hold a reference on the netns). Note that nf_tables_pre_exit_net() unregisters and releases basechain hooks but it is possible to see NETDEV_UNREGISTER at a later stage in the netns exit path, eg. veth peer device in another netns: cleanup_net() default_device_exit_batch() unregister_netdevice_many_notify() notifier_call_chain() nf_tables_netdev_event() __nft_release_basechain() In this particular case, same rule of thumb applies: if netns is alive, then wait for rcu grace period because netlink dump in the other netns could be in progress. Otherwise, if the other netns is going away then no netlink dump can be in progress and basechain hooks can be released inmediately. While at it, turn WARN_ON() into WARN_ON_ONCE() for the basechain validation, which should not ever happen. Fixes: 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 41 ++++++++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 91ae20cb7648..066a3ea33b12 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1103,6 +1103,7 @@ struct nft_rule_blob { * @name: name of the chain * @udlen: user data length * @udata: user data in the chain + * @rcu_head: rcu head for deferred release * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { @@ -1120,6 +1121,7 @@ struct nft_chain { char *name; u16 udlen; u8 *udata; + struct rcu_head rcu_head; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; @@ -1263,6 +1265,7 @@ static inline void nft_use_inc_restore(u32 *use) * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table + * @net: netnamespace this table belongs to * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table @@ -1282,6 +1285,7 @@ struct nft_table { struct list_head sets; struct list_head objects; struct list_head flowtables; + possible_net_t net; u64 hgenerator; u64 handle; u32 use; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a24fe62650a7..588a2757986c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1495,6 +1495,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); + write_pnet(&table->net, net); table->family = family; table->flags = flags; table->handle = ++nft_net->table_handle; @@ -11430,22 +11431,48 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -int __nft_release_basechain(struct nft_ctx *ctx) +static void __nft_release_basechain_now(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; - if (WARN_ON(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { list_del(&rule->list); - nft_use_dec(&ctx->chain->use); nf_tables_rule_release(ctx, rule); } + nf_tables_chain_destroy(ctx->chain); +} + +static void nft_release_basechain_rcu(struct rcu_head *head) +{ + struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); + struct nft_ctx ctx = { + .family = chain->table->family, + .chain = chain, + .net = read_pnet(&chain->table->net), + }; + + __nft_release_basechain_now(&ctx); + put_net(ctx.net); +} + +int __nft_release_basechain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + + if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) + return 0; + + nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); + list_for_each_entry(rule, &ctx->chain->rules, list) + nft_use_dec(&ctx->chain->use); + nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx->chain); + + if (maybe_get_net(ctx->net)) + call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu); + else + __nft_release_basechain_now(ctx); return 0; } -- cgit v1.3 From b4ebb58cb9a4b1b5cb5278b09d6afdcd71b2a6b4 Mon Sep 17 00:00:00 2001 From: Lingbo Kong Date: Thu, 31 Oct 2024 21:42:23 +0800 Subject: wifi: cfg80211: Remove the Medium Synchronization Delay validity check Currently, when the driver attempts to connect to an AP MLD with multiple APs, the cfg80211_mlme_check_mlo_compat() function requires the Medium Synchronization Delay values from different APs of the same AP MLD to be equal, which may result in connection failures. This is because when the driver receives a multi-link probe response from an AP MLD with multiple APs, cfg80211 updates the Elements for each AP based on the multi-link probe response. If the Medium Synchronization Delay is set in the multi-link probe response, the Elements for each AP belonging to the same AP MLD will have the Medium Synchronization Delay set simultaneously. If non-multi-link probe responses are received from different APs of the same MLD AP, cfg80211 will still update the Elements based on the non-multi-link probe response. Since the non-multi-link probe response does not set the Medium Synchronization Delay (IEEE 802.11be-2024-35.3.4.4), if the Elements from a non-multi-link probe response overwrite those from a multi-link probe response that has set the Medium Synchronization Delay, the Medium Synchronization Delay values for APs belonging to the same AP MLD will not be equal. This discrepancy causes the cfg80211_mlme_check_mlo_compat() function to fail, leading to connection failures. Commit ccb964b4ab16 ("wifi: cfg80211: validate MLO connections better") did not take this into account. To address this issue, remove this validity check. Fixes: ccb964b4ab16 ("wifi: cfg80211: validate MLO connections better") Signed-off-by: Lingbo Kong Link: https://patch.msgid.link/20241031134223.970-1-quic_lingbok@quicinc.com Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 4dac81854721..a5eb92d93074 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -340,12 +340,6 @@ cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a, return -EINVAL; } - if (ieee80211_mle_get_eml_med_sync_delay((const u8 *)mle_a) != - ieee80211_mle_get_eml_med_sync_delay((const u8 *)mle_b)) { - NL_SET_ERR_MSG(extack, "link EML medium sync delay mismatch"); - return -EINVAL; - } - if (ieee80211_mle_get_eml_cap((const u8 *)mle_a) != ieee80211_mle_get_eml_cap((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link EML capabilities mismatch"); -- cgit v1.3 From 9c46a3a5b394d6d123866aa44436fc2cd342eb0d Mon Sep 17 00:00:00 2001 From: Aleksei Vetrov Date: Tue, 29 Oct 2024 13:22:11 +0000 Subject: wifi: nl80211: fix bounds checker error in nl80211_parse_sched_scan The channels array in the cfg80211_scan_request has a __counted_by attribute attached to it, which points to the n_channels variable. This attribute is used in bounds checking, and if it is not set before the array is filled, then the bounds sanitizer will issue a warning or a kernel panic if CONFIG_UBSAN_TRAP is set. This patch sets the size of allocated memory as the initial value for n_channels. It is updated with the actual number of added elements after the array is filled. Fixes: aa4ec06c455d ("wifi: cfg80211: use __counted_by where appropriate") Cc: stable@vger.kernel.org Signed-off-by: Aleksei Vetrov Reviewed-by: Jeff Johnson Link: https://patch.msgid.link/20241029-nl80211_parse_sched_scan-bounds-checker-fix-v2-1-c804b787341f@google.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1ac8a196f376..d281568b2e2e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9802,6 +9802,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request = kzalloc(size, GFP_KERNEL); if (!request) return ERR_PTR(-ENOMEM); + request->n_channels = n_channels; if (n_ssids) request->ssids = (void *)request + -- cgit v1.3 From bb9df91cfe651d418719c52a4f47d4a49ac06609 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 26 Oct 2024 17:34:49 +0200 Subject: wifi: cfg80211: Fix an error handling path in nl80211_start_ap() All error handling paths go to "out", except this one. Before the commit in Fixes, error in the previous code would also end to "out", freeing the memory. Move the code up to avoid the leak. Fixes: 62262dd00c31 ("wifi: cfg80211: disallow SMPS in AP mode") Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/eae54ce066d541914f272b10cab7b263c08eced3.1729956868.git.christophe.jaillet@wanadoo.fr [move code, update commit message] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d281568b2e2e..b8cdd844f0e6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6123,6 +6123,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) !info->attrs[NL80211_ATTR_BEACON_HEAD]) return -EINVAL; + if (info->attrs[NL80211_ATTR_SMPS_MODE] && + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]) != NL80211_SMPS_OFF) + return -EOPNOTSUPP; + params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; @@ -6272,10 +6276,6 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } - if (info->attrs[NL80211_ATTR_SMPS_MODE] && - nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]) != NL80211_SMPS_OFF) - return -EOPNOTSUPP; - params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { err = -EOPNOTSUPP; -- cgit v1.3 From 702c290a1cb16f4a64567cae0bedb848399f7915 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Mon, 4 Nov 2024 08:35:44 +0000 Subject: sctp: Avoid enqueuing addr events redundantly Avoid modifying or enqueuing new events if it's possible to tell that no one will consume them. Since enqueueing requires searching the current queue for opposite events for the same address, adding addresses en-masse turns this inetaddr_event into a bottle-neck, as it will get slower and slower with each address added. Signed-off-by: Gilad Naaman Acked-by: Xin Long Link: https://patch.msgid.link/20241104083545.114-1-gnaaman@drivenets.com Signed-off-by: Paolo Abeni --- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index f7b809c0d142..b96c849545ae 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -103,10 +103,10 @@ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, ipv6_addr_equal(&addr->a.v6.sin6_addr, &ifa->addr) && addr->a.v6.sin6_scope_id == ifa->idev->dev->ifindex) { - sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; list_del_rcu(&addr->list); + sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); break; } } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 39ca5403d4d7..8b9a1b96695e 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -738,6 +738,20 @@ void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cm */ spin_lock_bh(&net->sctp.addr_wq_lock); + + /* Avoid searching the queue or modifying it if there are no consumers, + * as it can lead to performance degradation if addresses are modified + * en-masse. + * + * If the queue already contains some events, update it anyway to avoid + * ugly races between new sessions and new address events. + */ + if (list_empty(&net->sctp.auto_asconf_splist) && + list_empty(&net->sctp.addr_waitq)) { + spin_unlock_bh(&net->sctp.addr_wq_lock); + return; + } + /* Offsets existing events in addr_wq */ addrw = sctp_addr_wq_lookup(net, addr); if (addrw) { @@ -808,10 +822,10 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, if (addr->a.sa.sa_family == AF_INET && addr->a.v4.sin_addr.s_addr == ifa->ifa_local) { - sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; list_del_rcu(&addr->list); + sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); break; } } -- cgit v1.3 From de88df01796b309903b70888fbdf2b89607e3a6a Mon Sep 17 00:00:00 2001 From: Wenjia Zhang Date: Wed, 6 Nov 2024 09:26:12 +0100 Subject: net/smc: Fix lookup of netdev by using ib_device_get_netdev() The SMC-R variant of the SMC protocol used direct call to function ib_device_ops.get_netdev() to lookup netdev. As we used mlx5 device driver to run SMC-R, it failed to find a device, because in mlx5_ib the internal net device management for retrieving net devices was replaced by a common interface ib_device_get_netdev() in commit 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions"). Since such direct accesses to the internal net device management is not recommended at all, update the SMC-R code to use proper API ib_device_get_netdev(). Fixes: 54903572c23c ("net/smc: allow pnetid-less configuration") Reported-by: Aswin K Reviewed-by: Gerd Bayer Reviewed-by: Halil Pasic Reviewed-by: Simon Horman Reviewed-by: Dust Li Reviewed-by: Wen Gu Reviewed-by: Zhu Yanjun Reviewed-by: D. Wythe Signed-off-by: Wenjia Zhang Reviewed-by: Leon Romanovsky Link: https://patch.msgid.link/20241106082612.57803-1-wenjia@linux.ibm.com Signed-off-by: Jakub Kicinski --- net/smc/smc_ib.c | 8 ++------ net/smc/smc_pnet.c | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9297dc20bfe2..9c563cdbea90 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -899,9 +899,7 @@ static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) struct ib_device *ibdev = smcibdev->ibdev; struct net_device *ndev; - if (!ibdev->ops.get_netdev) - return; - ndev = ibdev->ops.get_netdev(ibdev, port + 1); + ndev = ib_device_get_netdev(ibdev, port + 1); if (ndev) { smcibdev->ndev_ifidx[port] = ndev->ifindex; dev_put(ndev); @@ -921,9 +919,7 @@ void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) port_cnt = smcibdev->ibdev->phys_port_cnt; for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { libdev = smcibdev->ibdev; - if (!libdev->ops.get_netdev) - continue; - lndev = libdev->ops.get_netdev(libdev, i + 1); + lndev = ib_device_get_netdev(libdev, i + 1); dev_put(lndev); if (lndev != ndev) continue; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index a04aa0e882f8..716808f374a8 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -1054,9 +1054,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; - if (!ibdev->ibdev->ops.get_netdev) - continue; - ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i); + ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); -- cgit v1.3 From fc9de52de38f656399d2ce40f7349a6b5f86e787 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Nov 2024 13:03:22 +0000 Subject: rxrpc: Fix missing locking causing hanging calls If a call gets aborted (e.g. because kafs saw a signal) between it being queued for connection and the I/O thread picking up the call, the abort will be prioritised over the connection and it will be removed from local->new_client_calls by rxrpc_disconnect_client_call() without a lock being held. This may cause other calls on the list to disappear if a race occurs. Fix this by taking the client_call_lock when removing a call from whatever list its ->wait_link happens to be on. Signed-off-by: David Howells cc: linux-afs@lists.infradead.org Reported-by: Marc Dionne Fixes: 9d35d880e0e4 ("rxrpc: Move client call connection to the I/O thread") Link: https://patch.msgid.link/726660.1730898202@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + net/rxrpc/conn_client.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a1b126a6b0d7..cc22596c7250 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -287,6 +287,7 @@ EM(rxrpc_call_see_input, "SEE input ") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ + EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ E_(rxrpc_call_see_zap, "SEE zap ") #define rxrpc_txqueue_traces \ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index d25bf1cf3670..bb11e8289d6d 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -516,6 +516,7 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local) spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); + rxrpc_see_call(call, rxrpc_call_see_waiting_call); spin_unlock(&local->client_call_lock); if (rxrpc_bundle_has_space(bundle)) @@ -586,7 +587,10 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call _debug("call is waiting"); ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); + /* May still be on ->new_client_calls. */ + spin_lock(&local->client_call_lock); list_del_init(&call->wait_link); + spin_unlock(&local->client_call_lock); return; } -- cgit v1.3 From d293958a8595ba566fb90b99da4d6263e14fee15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2024 22:19:22 +0000 Subject: net/smc: do not leave a dangling sk pointer in __smc_create() Thanks to commit 4bbd360a5084 ("socket: Print pf->create() when it does not clear sock->sk on failure."), syzbot found an issue with AF_SMC: smc_create must clear sock->sk on failure, family: 43, type: 1, protocol: 0 WARNING: CPU: 0 PID: 5827 at net/socket.c:1565 __sock_create+0x96f/0xa30 net/socket.c:1563 Modules linked in: CPU: 0 UID: 0 PID: 5827 Comm: syz-executor259 Not tainted 6.12.0-rc6-next-20241106-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__sock_create+0x96f/0xa30 net/socket.c:1563 Code: 03 00 74 08 4c 89 e7 e8 4f 3b 85 f8 49 8b 34 24 48 c7 c7 40 89 0c 8d 8b 54 24 04 8b 4c 24 0c 44 8b 44 24 08 e8 32 78 db f7 90 <0f> 0b 90 90 e9 d3 fd ff ff 89 e9 80 e1 07 fe c1 38 c1 0f 8c ee f7 RSP: 0018:ffffc90003e4fda0 EFLAGS: 00010246 RAX: 099c6f938c7f4700 RBX: 1ffffffff1a595fd RCX: ffff888034823c00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 00000000ffffffe9 R08: ffffffff81567052 R09: 1ffff920007c9f50 R10: dffffc0000000000 R11: fffff520007c9f51 R12: ffffffff8d2cafe8 R13: 1ffffffff1a595fe R14: ffffffff9a789c40 R15: ffff8880764298c0 FS: 000055557b518380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa62ff43225 CR3: 0000000031628000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: sock_create net/socket.c:1616 [inline] __sys_socket_create net/socket.c:1653 [inline] __sys_socket+0x150/0x3c0 net/socket.c:1700 __do_sys_socket net/socket.c:1714 [inline] __se_sys_socket net/socket.c:1712 [inline] For reference, see commit 2d859aff775d ("Merge branch 'do-not-leave-dangling-sk-pointers-in-pf-create-functions'") Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Signed-off-by: Eric Dumazet Cc: Ignat Korchagin Cc: D. Wythe Cc: Dust Li Reviewed-by: Kuniyuki Iwashima Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/20241106221922.1544045-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0316217b7687..9d76e902fd77 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3359,8 +3359,10 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, else rc = smc_create_clcsk(net, sk, family); - if (rc) + if (rc) { sk_common_release(sk); + sock->sk = NULL; + } out: return rc; } -- cgit v1.3 From 1904fb9ebf911441f90a68e96b22aa73e4410505 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Nov 2024 17:52:34 -0800 Subject: netlink: terminate outstanding dump on socket close Netlink supports iterative dumping of data. It provides the families the following ops: - start - (optional) kicks off the dumping process - dump - actual dump helper, keeps getting called until it returns 0 - done - (optional) pairs with .start, can be used for cleanup The whole process is asynchronous and the repeated calls to .dump don't actually happen in a tight loop, but rather are triggered in response to recvmsg() on the socket. This gives the user full control over the dump, but also means that the user can close the socket without getting to the end of the dump. To make sure .start is always paired with .done we check if there is an ongoing dump before freeing the socket, and if so call .done. The complication is that sockets can get freed from BH and .done is allowed to sleep. So we use a workqueue to defer the call, when needed. Unfortunately this does not work correctly. What we defer is not the cleanup but rather releasing a reference on the socket. We have no guarantee that we own the last reference, if someone else holds the socket they may release it in BH and we're back to square one. The whole dance, however, appears to be unnecessary. Only the user can interact with dumps, so we can clean up when socket is closed. And close always happens in process context. Some async code may still access the socket after close, queue notification skbs to it etc. but no dumps can start, end or otherwise make progress. Delete the workqueue and flush the dump state directly from the release handler. Note that further cleanup is possible in -next, for instance we now always call .done before releasing the main module reference, so dump doesn't have to take a reference of its own. Reported-by: syzkaller Fixes: ed5d7788a934 ("netlink: Do not schedule work from sk_destruct") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241106015235.2458807-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 31 ++++++++----------------------- net/netlink/af_netlink.h | 2 -- 2 files changed, 8 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0a9287fadb47..f84aad420d44 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -393,15 +393,6 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) static void netlink_sock_destruct(struct sock *sk) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->cb_running) { - if (nlk->cb.done) - nlk->cb.done(&nlk->cb); - module_put(nlk->cb.module); - kfree_skb(nlk->cb.skb); - } - skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { @@ -414,14 +405,6 @@ static void netlink_sock_destruct(struct sock *sk) WARN_ON(nlk_sk(sk)->groups); } -static void netlink_sock_destruct_work(struct work_struct *work) -{ - struct netlink_sock *nlk = container_of(work, struct netlink_sock, - work); - - sk_free(&nlk->sk); -} - /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves @@ -731,12 +714,6 @@ static void deferred_put_nlk_sk(struct rcu_head *head) if (!refcount_dec_and_test(&sk->sk_refcnt)) return; - if (nlk->cb_running && nlk->cb.done) { - INIT_WORK(&nlk->work, netlink_sock_destruct_work); - schedule_work(&nlk->work); - return; - } - sk_free(sk); } @@ -788,6 +765,14 @@ static int netlink_release(struct socket *sock) NETLINK_URELEASE, &n); } + /* Terminate any outstanding dump */ + if (nlk->cb_running) { + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); + module_put(nlk->cb.module); + kfree_skb(nlk->cb.skb); + } + module_put(nlk->module); if (netlink_is_kernel(sk)) { diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 5b0e4e62ab8b..778a3809361f 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -4,7 +4,6 @@ #include #include -#include #include /* flags */ @@ -50,7 +49,6 @@ struct netlink_sock { struct rhash_head node; struct rcu_head rcu; - struct work_struct work; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) -- cgit v1.3 From 48171c65f61148b0025128b70837280123f1309d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 6 Nov 2024 22:37:32 +0100 Subject: ipv4: Prepare ip_route_output() to future .flowi4_tos conversion. Convert the "tos" parameter of ip_route_output() to dscp_t. This way we'll have a dscp_t value directly available when .flowi4_tos will eventually be converted to dscp_t. All ip_route_output() callers but one set this "tos" parameter to 0 and therefore don't need to be adapted to the new prototype. Only br_nf_pre_routing_finish() needs conversion. It can just use ip4h_dscp() to get the DSCP field from the IPv4 header. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/0f10d031dd44c70aae9bc6e19391cb30d5c2fe71.1730928699.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 6 +++--- net/bridge/br_netfilter_hooks.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 586e59f7ed8a..0a690adfdff5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -156,12 +156,12 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4 * structure is only partially set, it may bypass some fib-rules. */ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, - __be32 saddr, u8 tos, int oif, - __u8 scope) + __be32 saddr, dscp_t dscp, + int oif, __u8 scope) { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = tos, + .flowi4_tos = inet_dscp_to_dsfield(dscp), .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 17a5f5923d61..7f2f40cef5fe 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -406,7 +406,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ goto free_skb; rt = ip_route_output(net, iph->daddr, 0, - iph->tos & INET_DSCP_MASK, 0, + ip4h_dscp(iph), 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't -- cgit v1.3 From 38a1f50a5efb5a941f3491d4d2353d12a87d04a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2024 13:18:17 +0000 Subject: phonet: do not call synchronize_rcu() from phonet_route_del() Calling synchronize_rcu() while holding rcu_read_lock() is not permitted [1] Move the synchronize_rcu() + dev_put() to route_doit(). Alternative would be to not use rcu_read_lock() in route_doit(). [1] WARNING: suspicious RCU usage 6.12.0-rc5-syzkaller-01056-gf07a6e6ceb05 #0 Not tainted ----------------------------- kernel/rcu/tree.c:4092 Illegal synchronize_rcu() in RCU read-side critical section! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by syz-executor427/5840: #0: ffffffff8e937da0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937da0 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:849 [inline] #0: ffffffff8e937da0 (rcu_read_lock){....}-{1:2}, at: route_doit+0x3d6/0x640 net/phonet/pn_netlink.c:264 stack backtrace: CPU: 1 UID: 0 PID: 5840 Comm: syz-executor427 Not tainted 6.12.0-rc5-syzkaller-01056-gf07a6e6ceb05 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 lockdep_rcu_suspicious+0x226/0x340 kernel/locking/lockdep.c:6821 synchronize_rcu+0xea/0x360 kernel/rcu/tree.c:4089 phonet_route_del+0xc6/0x140 net/phonet/pn_dev.c:409 route_doit+0x514/0x640 net/phonet/pn_netlink.c:275 rtnetlink_rcv_msg+0x791/0xcf0 net/core/rtnetlink.c:6790 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2551 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:744 sock_write_iter+0x2d7/0x3f0 net/socket.c:1165 new_sync_write fs/read_write.c:590 [inline] vfs_write+0xaeb/0xd30 fs/read_write.c:683 ksys_write+0x183/0x2b0 fs/read_write.c:736 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 17a1ac0018ae ("phonet: Don't hold RTNL for route_doit().") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Cc: Remi Denis-Courmont Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20241106131818.1240710-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/phonet/pn_dev.c | 5 +++-- net/phonet/pn_netlink.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 19234d664c4f..5c36bae37b8f 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -406,8 +406,9 @@ int phonet_route_del(struct net_device *dev, u8 daddr) if (!dev) return -ENOENT; - synchronize_rcu(); - dev_put(dev); + + /* Note : our caller must call synchronize_rcu() and dev_put(dev) */ + return 0; } diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index ca1f04e4a2d9..b9043c92dc24 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -233,6 +233,7 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct nlattr *tb[RTA_MAX+1]; + bool sync_needed = false; struct net_device *dev; struct rtmsg *rtm; u32 ifindex; @@ -269,13 +270,20 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENODEV; } - if (nlh->nlmsg_type == RTM_NEWROUTE) + if (nlh->nlmsg_type == RTM_NEWROUTE) { err = phonet_route_add(dev, dst); - else + } else { err = phonet_route_del(dev, dst); + if (!err) + sync_needed = true; + } rcu_read_unlock(); + if (sync_needed) { + synchronize_rcu(); + dev_put(dev); + } if (!err) rtm_phonet_notify(net, nlh->nlmsg_type, ifindex, dst); -- cgit v1.3 From 675d4566e599bab1a225d418bbf7a53100367978 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 4 Nov 2024 17:11:27 -0500 Subject: SUNRPC: Fix a hang in TLS sock_close if sk_write_pending We've observed an NFS server shrink the TCP window and then reset the TCP connection as part of a HA failover. When the connection has TLS, often the NFS client will hang indefinitely in this stack: wait_woken+0x70/0x80 wait_on_pending_writer+0xe4/0x110 [tls] tls_sk_proto_close+0x368/0x3a0 [tls] inet_release+0x54/0xb0 __sock_release+0x48/0xc8 sock_close+0x20/0x38 __fput+0xe0/0x2f0 __fput_sync+0x58/0x70 xs_reset_transport+0xe8/0x1f8 [sunrpc] xs_tcp_shutdown+0xa4/0x190 [sunrpc] xprt_autoclose+0x68/0x170 [sunrpc] process_one_work+0x180/0x420 worker_thread+0x258/0x368 kthread+0x104/0x118 ret_from_fork+0x10/0x20 This hang prevents the client from closing the socket and reconnecting to the server. Because xs_nospace() elevates sk_write_pending, and sk_sndtimeo is MAX_SCHEDULE_TIMEOUT, tls_sk_proto_close is never able to complete its wait for pending writes to the socket. For this case where we are resetting the transport anyway, we don't expect the socket to ever have write space, so fix this by simply clearing the sock's sndtimeo under the sock's lock. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1326fbf45a34..d587c261d999 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1278,6 +1278,7 @@ static void xs_reset_transport(struct sock_xprt *transport) transport->file = NULL; sk->sk_user_data = NULL; + sk->sk_sndtimeo = 0; xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); -- cgit v1.3 From fb86c42a2a5d44e849ddfbc98b8d2f4f40d36ee3 Mon Sep 17 00:00:00 2001 From: Jiawei Ye Date: Fri, 8 Nov 2024 08:18:52 +0000 Subject: bpf: Fix mismatched RCU unlock flavour in bpf_out_neigh_v6 In the bpf_out_neigh_v6 function, rcu_read_lock() is used to begin an RCU read-side critical section. However, when unlocking, one branch incorrectly uses a different RCU unlock flavour rcu_read_unlock_bh() instead of rcu_read_unlock(). This mismatch in RCU locking flavours can lead to unexpected behavior and potential concurrency issues. This possible bug was identified using a static analysis tool developed by myself, specifically designed to detect RCU-related issues. This patch corrects the mismatched unlock flavour by replacing the incorrect rcu_read_unlock_bh() with the appropriate rcu_read_unlock(), ensuring that the RCU critical section is properly exited. This change prevents potential synchronization issues and aligns with proper RCU usage patterns. Fixes: 09eed1192cec ("neighbour: switch to standard rcu, instead of rcu_bh") Signed-off-by: Jiawei Ye Acked-by: Yonghong Song Link: https://lore.kernel.org/r/tencent_CFD3D1C3D68B45EA9F52D8EC76D2C4134306@qq.com Signed-off-by: Martin KaFai Lau --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e31ee8be2de0..fb56567c551e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2249,7 +2249,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); if (dst) IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: -- cgit v1.3 From 580db513b4a9d52f306580015a1872eea0a0894e Mon Sep 17 00:00:00 2001 From: Khang Nguyen Date: Tue, 5 Nov 2024 14:19:15 +0700 Subject: net: mctp: Expose transport binding identifier via IFLA attribute MCTP control protocol implementations are transport binding dependent. Endpoint discovery is mandatory based on transport binding. Message timing requirements are specified in each respective transport binding specification. However, we currently have no means to get this information from MCTP links. Add a IFLA_MCTP_PHYS_BINDING netlink link attribute, which represents the transport type using the DMTF DSP0239-defined type numbers, returned as part of RTM_GETLINK data. We get an IFLA_MCTP_PHYS_BINDING attribute for each MCTP link, for example: - 0x00 (unspec) for loopback interface; - 0x01 (SMBus/I2C) for mctpi2c%d interfaces; and - 0x05 (serial) for mctpserial%d interfaces. Signed-off-by: Khang Nguyen Reviewed-by: Matt Johnston Link: https://patch.msgid.link/20241105071915.821871-1-khangng@os.amperecomputing.com Signed-off-by: Jakub Kicinski --- drivers/net/mctp/mctp-i2c.c | 3 ++- drivers/net/mctp/mctp-i3c.c | 2 +- drivers/net/mctp/mctp-serial.c | 5 +++-- include/net/mctp.h | 18 ++++++++++++++++++ include/net/mctpdevice.h | 4 +++- include/uapi/linux/if_link.h | 1 + net/mctp/device.c | 12 +++++++++--- 7 files changed, 37 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c index e70fb6687994..d2b3f5a59141 100644 --- a/drivers/net/mctp/mctp-i2c.c +++ b/drivers/net/mctp/mctp-i2c.c @@ -880,7 +880,8 @@ static int mctp_i2c_add_netdev(struct mctp_i2c_client *mcli, goto err; } - rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops); + rc = mctp_register_netdev(ndev, &mctp_i2c_mctp_ops, + MCTP_PHYS_BINDING_SMBUS); if (rc < 0) { dev_err(&mcli->client->dev, "register netdev \"%s\" failed %d\n", diff --git a/drivers/net/mctp/mctp-i3c.c b/drivers/net/mctp/mctp-i3c.c index 1bc87a062686..9adad59b8676 100644 --- a/drivers/net/mctp/mctp-i3c.c +++ b/drivers/net/mctp/mctp-i3c.c @@ -607,7 +607,7 @@ __must_hold(&busdevs_lock) goto err_free_uninit; } - rc = mctp_register_netdev(ndev, NULL); + rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_I3C); if (rc < 0) { dev_warn(&ndev->dev, "netdev register failed: %d\n", rc); goto err_free_netdev; diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c index e63720ec3238..26c9a33fd636 100644 --- a/drivers/net/mctp/mctp-serial.c +++ b/drivers/net/mctp/mctp-serial.c @@ -23,6 +23,7 @@ #include #include +#include #include #define MCTP_SERIAL_MTU 68 /* base mtu (64) + mctp header */ @@ -470,7 +471,7 @@ static int mctp_serial_open(struct tty_struct *tty) spin_lock_init(&dev->lock); INIT_WORK(&dev->tx_work, mctp_serial_tx_work); - rc = register_netdev(ndev); + rc = mctp_register_netdev(ndev, NULL, MCTP_PHYS_BINDING_SERIAL); if (rc) goto free_netdev; @@ -492,7 +493,7 @@ static void mctp_serial_close(struct tty_struct *tty) struct mctp_serial *dev = tty->disc_data; int idx = dev->idx; - unregister_netdev(dev->netdev); + mctp_unregister_netdev(dev->netdev); ida_free(&mctp_serial_ida, idx); } diff --git a/include/net/mctp.h b/include/net/mctp.h index 28d59ae94ca3..1ecbff7116f6 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -298,4 +298,22 @@ void mctp_routes_exit(void); int mctp_device_init(void); void mctp_device_exit(void); +/* MCTP IDs and Codes from DMTF specification + * "DSP0239 Management Component Transport Protocol (MCTP) IDs and Codes" + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0239_1.11.1.pdf + */ +enum mctp_phys_binding { + MCTP_PHYS_BINDING_UNSPEC = 0x00, + MCTP_PHYS_BINDING_SMBUS = 0x01, + MCTP_PHYS_BINDING_PCIE_VDM = 0x02, + MCTP_PHYS_BINDING_USB = 0x03, + MCTP_PHYS_BINDING_KCS = 0x04, + MCTP_PHYS_BINDING_SERIAL = 0x05, + MCTP_PHYS_BINDING_I3C = 0x06, + MCTP_PHYS_BINDING_MMBI = 0x07, + MCTP_PHYS_BINDING_PCC = 0x08, + MCTP_PHYS_BINDING_UCIE = 0x09, + MCTP_PHYS_BINDING_VENDOR = 0xFF, +}; + #endif /* __NET_MCTP_H */ diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 5c0d04b5c12c..957d9ef924c5 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -22,6 +22,7 @@ struct mctp_dev { refcount_t refs; unsigned int net; + enum mctp_phys_binding binding; const struct mctp_netdev_ops *ops; @@ -44,7 +45,8 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops); + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding); void mctp_unregister_netdev(struct net_device *dev); void mctp_dev_hold(struct mctp_dev *mdev); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8516c1ccd57a..2575e0cd9b48 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1958,6 +1958,7 @@ struct ifla_rmnet_flags { enum { IFLA_MCTP_UNSPEC, IFLA_MCTP_NET, + IFLA_MCTP_PHYS_BINDING, __IFLA_MCTP_MAX, }; diff --git a/net/mctp/device.c b/net/mctp/device.c index 3d75b919995d..26ce34b7e88e 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -371,6 +371,8 @@ static int mctp_fill_link_af(struct sk_buff *skb, return -ENODATA; if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) return -EMSGSIZE; + if (nla_put_u8(skb, IFLA_MCTP_PHYS_BINDING, mdev->binding)) + return -EMSGSIZE; return 0; } @@ -385,6 +387,7 @@ static size_t mctp_get_link_af_size(const struct net_device *dev, if (!mdev) return 0; ret = nla_total_size(4); /* IFLA_MCTP_NET */ + ret += nla_total_size(1); /* IFLA_MCTP_PHYS_BINDING */ mctp_dev_put(mdev); return ret; } @@ -480,7 +483,8 @@ static int mctp_dev_notify(struct notifier_block *this, unsigned long event, } static int mctp_register_netdevice(struct net_device *dev, - const struct mctp_netdev_ops *ops) + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding) { struct mctp_dev *mdev; @@ -489,17 +493,19 @@ static int mctp_register_netdevice(struct net_device *dev, return PTR_ERR(mdev); mdev->ops = ops; + mdev->binding = binding; return register_netdevice(dev); } int mctp_register_netdev(struct net_device *dev, - const struct mctp_netdev_ops *ops) + const struct mctp_netdev_ops *ops, + enum mctp_phys_binding binding) { int rc; rtnl_lock(); - rc = mctp_register_netdevice(dev, ops); + rc = mctp_register_netdevice(dev, ops, binding); rtnl_unlock(); return rc; -- cgit v1.3 From 7d28f4fc868ccc26124d368e8d2ead9d21c23542 Mon Sep 17 00:00:00 2001 From: MoYuanhao Date: Wed, 6 Nov 2024 15:10:35 +0800 Subject: mptcp: remove the redundant assignment of 'new_ctx->tcp_sock' in subflow_ulp_clone() The variable has already been assigned in the subflow_create_ctx(), So we don't need to reassign this variable in the subflow_ulp_clone(). Signed-off-by: MoYuanhao Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241106071035.2591-1-moyuanhao3676@163.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 07352b15f145..fd021cf8286e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -2049,7 +2049,6 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->tcp_state_change = old_ctx->tcp_state_change; new_ctx->tcp_error_report = old_ctx->tcp_error_report; new_ctx->rel_write_seq = 1; - new_ctx->tcp_sock = newsk; if (subflow_req->mp_capable) { /* see comments in subflow_syn_recv_sock(), MPTCP connection -- cgit v1.3 From e629295bd60abf4da1db85b82819ca6a4f6c1e79 Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Wed, 6 Nov 2024 04:36:04 -0500 Subject: hv_sock: Initializing vsk->trans to NULL to prevent a dangling pointer When hvs is released, there is a possibility that vsk->trans may not be initialized to NULL, which could lead to a dangling pointer. This issue is resolved by initializing vsk->trans to NULL. Signed-off-by: Hyunwoo Kim Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/Zys4hCj61V+mQfX2@v4bel-B760M-AORUS-ELITE-AX Signed-off-by: Jakub Kicinski --- net/vmw_vsock/hyperv_transport.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index e2157e387217..56c232cf5b0f 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -549,6 +549,7 @@ static void hvs_destruct(struct vsock_sock *vsk) vmbus_hvsock_device_unregister(chan); kfree(hvs); + vsk->trans = NULL; } static int hvs_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr) -- cgit v1.3 From eb72e7fcc83987d5d5595b43222f23b295d5de7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2024 19:20:21 +0000 Subject: sctp: fix possible UAF in sctp_v6_available() A lockdep report [1] with CONFIG_PROVE_RCU_LIST=y hints that sctp_v6_available() is calling dev_get_by_index_rcu() and ipv6_chk_addr() without holding rcu. [1] ============================= WARNING: suspicious RCU usage 6.12.0-rc5-virtme #1216 Tainted: G W ----------------------------- net/core/dev.c:876 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by sctp_hello/31495: #0: ffff9f1ebbdb7418 (sk_lock-AF_INET6){+.+.}-{0:0}, at: sctp_bind (./arch/x86/include/asm/jump_label.h:27 net/sctp/socket.c:315) sctp stack backtrace: CPU: 7 UID: 0 PID: 31495 Comm: sctp_hello Tainted: G W 6.12.0-rc5-virtme #1216 Tainted: [W]=WARN Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) lockdep_rcu_suspicious (kernel/locking/lockdep.c:6822) dev_get_by_index_rcu (net/core/dev.c:876 (discriminator 7)) sctp_v6_available (net/sctp/ipv6.c:701) sctp sctp_do_bind (net/sctp/socket.c:400 (discriminator 1)) sctp sctp_bind (net/sctp/socket.c:320) sctp inet6_bind_sk (net/ipv6/af_inet6.c:465) ? security_socket_bind (security/security.c:4581 (discriminator 1)) __sys_bind (net/socket.c:1848 net/socket.c:1869) ? do_user_addr_fault (./include/linux/rcupdate.h:347 ./include/linux/rcupdate.h:880 ./include/linux/mm.h:729 arch/x86/mm/fault.c:1340) ? do_user_addr_fault (./arch/x86/include/asm/preempt.h:84 (discriminator 13) ./include/linux/rcupdate.h:98 (discriminator 13) ./include/linux/rcupdate.h:882 (discriminator 13) ./include/linux/mm.h:729 (discriminator 13) arch/x86/mm/fault.c:1340 (discriminator 13)) __x64_sys_bind (net/socket.c:1877 (discriminator 1) net/socket.c:1875 (discriminator 1) net/socket.c:1875 (discriminator 1)) do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f59b934a1e7 Code: 44 00 00 48 8b 15 39 8c 0c 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 b8 31 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 8c 0c 00 f7 d8 64 89 01 48 All code ======== 0: 44 00 00 add %r8b,(%rax) 3: 48 8b 15 39 8c 0c 00 mov 0xc8c39(%rip),%rdx # 0xc8c43 a: f7 d8 neg %eax c: 64 89 02 mov %eax,%fs:(%rdx) f: b8 ff ff ff ff mov $0xffffffff,%eax 14: eb bd jmp 0xffffffffffffffd3 16: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 1d: 00 00 00 20: 0f 1f 00 nopl (%rax) 23: b8 31 00 00 00 mov $0x31,%eax 28: 0f 05 syscall 2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction 30: 73 01 jae 0x33 32: c3 ret 33: 48 8b 0d 09 8c 0c 00 mov 0xc8c09(%rip),%rcx # 0xc8c43 3a: f7 d8 neg %eax 3c: 64 89 01 mov %eax,%fs:(%rcx) 3f: 48 rex.W Code starting with the faulting instruction =========================================== 0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax 6: 73 01 jae 0x9 8: c3 ret 9: 48 8b 0d 09 8c 0c 00 mov 0xc8c09(%rip),%rcx # 0xc8c19 10: f7 d8 neg %eax 12: 64 89 01 mov %eax,%fs:(%rcx) 15: 48 rex.W RSP: 002b:00007ffe2d0ad398 EFLAGS: 00000202 ORIG_RAX: 0000000000000031 RAX: ffffffffffffffda RBX: 00007ffe2d0ad3d0 RCX: 00007f59b934a1e7 RDX: 000000000000001c RSI: 00007ffe2d0ad3d0 RDI: 0000000000000005 RBP: 0000000000000005 R08: 1999999999999999 R09: 0000000000000000 R10: 00007f59b9253298 R11: 0000000000000202 R12: 00007ffe2d0ada61 R13: 0000000000000000 R14: 0000562926516dd8 R15: 00007f59b9479000 Fixes: 6fe1e52490a9 ("sctp: check ipv6 addr with sk_bound_dev if set") Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Acked-by: Xin Long Link: https://patch.msgid.link/20241107192021.2579789-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/ipv6.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index f7b809c0d142..38e2fbdcbeac 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -683,7 +683,7 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); struct net_device *dev = NULL; - int type; + int type, res, bound_dev_if; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) @@ -697,14 +697,21 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (!(type & IPV6_ADDR_UNICAST)) return 0; - if (sk->sk_bound_dev_if) { - dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + rcu_read_lock(); + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if) { + res = 0; + dev = dev_get_by_index_rcu(net, bound_dev_if); if (!dev) - return 0; + goto out; } - return ipv6_can_nonlocal_bind(net, &sp->inet) || - ipv6_chk_addr(net, in6, dev, 0); + res = ipv6_can_nonlocal_bind(net, &sp->inet) || + ipv6_chk_addr(net, in6, dev, 0); + +out: + rcu_read_unlock(); + return res; } /* This function checks if the address is a valid address to be used for -- cgit v1.3 From 41b3caa7c0761141aa6d508924b9d23db57a17bc Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:38 +0000 Subject: neighbour: Add hlist_node to struct neighbour Add a doubly-linked node to neighbours, so that they can be deleted without iterating the entire bucket they're in. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-2-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 2 ++ net/core/neighbour.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3887ed9e5026..0402447854c7 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -136,6 +136,7 @@ struct neigh_statistics { struct neighbour { struct neighbour __rcu *next; + struct hlist_node hash; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -191,6 +192,7 @@ struct pneigh_entry { struct neigh_hash_table { struct neighbour __rcu **hash_buckets; + struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4b871cecd2ce..5552e6b05c82 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -216,6 +216,7 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, neigh = rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock)); rcu_assign_pointer(*np, neigh); + hlist_del_rcu(&n->hash); neigh_mark_dead(n); retval = true; } @@ -402,6 +403,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); write_lock(&n->lock); neigh_del_timer(n); neigh_mark_dead(n); @@ -529,20 +531,30 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { + size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head); size_t size = (1 << shift) * sizeof(struct neighbour *); - struct neigh_hash_table *ret; struct neighbour __rcu **buckets; + struct hlist_head *hash_heads; + struct neigh_hash_table *ret; int i; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; + buckets = kvzalloc(size, GFP_ATOMIC); if (!buckets) { kfree(ret); return NULL; } + hash_heads = kvzalloc(hash_heads_size, GFP_ATOMIC); + if (!hash_heads) { + kvfree(buckets); + kfree(ret); + return NULL; + } ret->hash_buckets = buckets; + ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) neigh_get_hash_rnd(&ret->hash_rnd[i]); @@ -556,6 +568,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) rcu); kvfree(nht->hash_buckets); + kvfree(nht->hash_heads); kfree(nht); } @@ -592,6 +605,8 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, new_nht->hash_buckets[hash], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(new_nht->hash_buckets[hash], n); + hlist_del_rcu(&n->hash); + hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } } @@ -702,6 +717,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); rcu_assign_pointer(nht->hash_buckets[hash_val], n); + hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -987,6 +1003,7 @@ static void neigh_periodic_work(struct work_struct *work) rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -3116,6 +3133,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, rcu_assign_pointer(*np, rcu_dereference_protected(n->next, lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); neigh_mark_dead(n); } else np = &n->next; -- cgit v1.3 From 00df5e1a3fdf36624d59ef6ab09010ebaee6e66a Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:40 +0000 Subject: neighbour: Convert seq_file functions to use hlist Convert seq_file-related neighbour functionality to use neighbour::hash and the related for_each macro. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-4-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 104 ++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5552e6b05c82..3485d6b3ba99 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3193,43 +3193,53 @@ EXPORT_SYMBOL(neigh_xmit); #ifdef CONFIG_PROC_FS -static struct neighbour *neigh_get_first(struct seq_file *seq) +static struct neighbour *neigh_get_valid(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); + + if (!net_eq(dev_net(n->dev), net)) + return NULL; + + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, pos ? pos : &fakep); + if (!v) + return NULL; + if (pos) + return v; + } + + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + return n; + + if (READ_ONCE(n->nud_state) & ~NUD_NOARP) + return n; + + return NULL; +} + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; struct neigh_hash_table *nht = state->nht; - struct neighbour *n = NULL; - int bucket; + struct neighbour *n, *tmp; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) { - n = rcu_dereference(nht->hash_buckets[bucket]); - - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - loff_t fakep = 0; - void *v; - v = state->neigh_sub_iter(state, n, &fakep); - if (!v) - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (READ_ONCE(n->nud_state) & ~NUD_NOARP) - break; -next: - n = rcu_dereference(n->next); + while (++state->bucket < (1 << nht->hash_shift)) { + neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) { + tmp = neigh_get_valid(seq, n, NULL); + if (tmp) + return tmp; } - - if (n) - break; } - state->bucket = bucket; - return n; + return NULL; } static struct neighbour *neigh_get_next(struct seq_file *seq, @@ -3237,46 +3247,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, loff_t *pos) { struct neigh_seq_state *state = seq->private; - struct net *net = seq_file_net(seq); - struct neigh_hash_table *nht = state->nht; + struct neighbour *tmp; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); + if (v) return n; } - n = rcu_dereference(n->next); - - while (1) { - while (n) { - if (!net_eq(dev_net(n->dev), net)) - goto next; - if (state->neigh_sub_iter) { - void *v = state->neigh_sub_iter(state, n, pos); - if (v) - return n; - goto next; - } - if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) - break; - if (READ_ONCE(n->nud_state) & ~NUD_NOARP) - break; -next: - n = rcu_dereference(n->next); + hlist_for_each_entry_continue(n, hash) { + tmp = neigh_get_valid(seq, n, pos); + if (tmp) { + n = tmp; + goto out; } - - if (n) - break; - - if (++state->bucket >= (1 << nht->hash_shift)) - break; - - n = rcu_dereference(nht->hash_buckets[state->bucket]); } + n = neigh_get_first(seq); +out: if (n && pos) --(*pos); + return n; } @@ -3379,7 +3371,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl struct neigh_seq_state *state = seq->private; state->tbl = tbl; - state->bucket = 0; + state->bucket = -1; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); rcu_read_lock(); -- cgit v1.3 From 0e3bcb0f78a0ca7cfdb7906dc79d922e19ba09b5 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:41 +0000 Subject: neighbour: Convert iteration to use hlist+macro Remove all usage of the bare neighbour::next pointer, replacing them with neighbour::hash and its for_each macro. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-5-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +---- net/core/neighbour.c | 47 ++++++++++++++++++----------------------------- 2 files changed, 19 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 4b9068c5e668..94cf4f8c118f 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -311,12 +311,9 @@ static inline struct neighbour *___neigh_lookup_noref( u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); - for (n = rcu_dereference(nht->hash_buckets[hash_val]); - n != NULL; - n = rcu_dereference(n->next)) { + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[hash_val]) if (n->dev == dev && key_eq(n, pkey)) return n; - } return NULL; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3485d6b3ba99..f99354d768c2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -387,11 +387,11 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour *n; struct neighbour __rcu **np = &nht->hash_buckets[i]; + struct hlist_node *tmp; + struct neighbour *n; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { if (dev && n->dev != dev) { np = &n->next; continue; @@ -587,18 +587,14 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, return old_nht; for (i = 0; i < (1 << old_nht->hash_shift); i++) { - struct neighbour *n, *next; + struct hlist_node *tmp; + struct neighbour *n; - for (n = rcu_dereference_protected(old_nht->hash_buckets[i], - lockdep_is_held(&tbl->lock)); - n != NULL; - n = next) { + neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash >>= (32 - new_nht->hash_shift); - next = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); rcu_assign_pointer(n->next, rcu_dereference_protected( @@ -693,11 +689,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); - n1 != NULL; - n1 = rcu_dereference_protected(n1->next, - lockdep_is_held(&tbl->lock))) { + neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) { if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) { if (want_ref) neigh_hold(n1); @@ -949,10 +941,11 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); - struct neighbour *n; + struct neigh_hash_table *nht; struct neighbour __rcu **np; + struct hlist_node *tmp; + struct neighbour *n; unsigned int i; - struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); @@ -979,8 +972,7 @@ static void neigh_periodic_work(struct work_struct *work) for (i = 0 ; i < (1 << nht->hash_shift); i++) { np = &nht->hash_buckets[i]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; write_lock(&n->lock); @@ -2730,9 +2722,8 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; - for (n = rcu_dereference(nht->hash_buckets[h]), idx = 0; - n != NULL; - n = rcu_dereference(n->next)) { + idx = 0; + neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) { if (idx < s_idx || !net_eq(dev_net(n->dev), net)) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -3099,9 +3090,7 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; - for (n = rcu_dereference(nht->hash_buckets[chain]); - n != NULL; - n = rcu_dereference(n->next)) + neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } read_unlock_bh(&tbl->lock); @@ -3113,18 +3102,18 @@ EXPORT_SYMBOL(neigh_for_each); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { - int chain; struct neigh_hash_table *nht; + int chain; nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { - struct neighbour *n; struct neighbour __rcu **np; + struct hlist_node *tmp; + struct neighbour *n; np = &nht->hash_buckets[chain]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock))) != NULL) { + neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); -- cgit v1.3 From a01a67ab2fffa7458354f0a666a6d550fa2b82fc Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:42 +0000 Subject: neighbour: Remove bare neighbour::next pointer Remove the now-unused neighbour::next pointer, leaving struct neighbour solely with the hlist_node implementation. Signed-off-by: Gilad Naaman Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241107160444.2913124-6-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 +-- net/core/neighbour.c | 90 ++++++------------------------------------------- net/ipv4/arp.c | 2 +- 3 files changed, 12 insertions(+), 84 deletions(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 94cf4f8c118f..40aac1e24c68 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -135,7 +135,6 @@ struct neigh_statistics { #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { - struct neighbour __rcu *next; struct hlist_node hash; struct neigh_table *tbl; struct neigh_parms *parms; @@ -191,7 +190,6 @@ struct pneigh_entry { #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { - struct neighbour __rcu **hash_buckets; struct hlist_head *hash_heads; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; @@ -354,7 +352,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); +bool neigh_remove_one(struct neighbour *ndel); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f99354d768c2..59f359c7b5e3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -204,18 +204,12 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, } } -static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, - struct neigh_table *tbl) +bool neigh_remove_one(struct neighbour *n) { bool retval = false; write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { - struct neighbour *neigh; - - neigh = rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock)); - rcu_assign_pointer(*np, neigh); hlist_del_rcu(&n->hash); neigh_mark_dead(n); retval = true; @@ -226,29 +220,6 @@ static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, return retval; } -bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) -{ - struct neigh_hash_table *nht; - void *pkey = ndel->primary_key; - u32 hash_val; - struct neighbour *n; - struct neighbour __rcu **np; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); - hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); - hash_val = hash_val >> (32 - nht->hash_shift); - - np = &nht->hash_buckets[hash_val]; - while ((n = rcu_dereference_protected(*np, - lockdep_is_held(&tbl->lock)))) { - if (n == ndel) - return neigh_del(n, np, tbl); - np = &n->next; - } - return false; -} - static int neigh_forced_gc(struct neigh_table *tbl) { int max_clean = atomic_read(&tbl->gc_entries) - @@ -276,7 +247,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) remove = true; write_unlock(&n->lock); - if (remove && neigh_remove_one(n, tbl)) + if (remove && neigh_remove_one(n)) shrunk++; if (shrunk >= max_clean) break; @@ -387,22 +358,15 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, lockdep_is_held(&tbl->lock)); for (i = 0; i < (1 << nht->hash_shift); i++) { - struct neighbour __rcu **np = &nht->hash_buckets[i]; struct hlist_node *tmp; struct neighbour *n; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { - if (dev && n->dev != dev) { - np = &n->next; + if (dev && n->dev != dev) continue; - } - if (skip_perm && n->nud_state & NUD_PERMANENT) { - np = &n->next; + if (skip_perm && n->nud_state & NUD_PERMANENT) continue; - } - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); + hlist_del_rcu(&n->hash); write_lock(&n->lock); neigh_del_timer(n); @@ -531,9 +495,7 @@ static void neigh_get_hash_rnd(u32 *x) static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) { - size_t hash_heads_size = (1 << shift) * sizeof(struct hlist_head); - size_t size = (1 << shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets; + size_t size = (1 << shift) * sizeof(struct hlist_head); struct hlist_head *hash_heads; struct neigh_hash_table *ret; int i; @@ -542,18 +504,11 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) if (!ret) return NULL; - buckets = kvzalloc(size, GFP_ATOMIC); - if (!buckets) { - kfree(ret); - return NULL; - } - hash_heads = kvzalloc(hash_heads_size, GFP_ATOMIC); + hash_heads = kvzalloc(size, GFP_ATOMIC); if (!hash_heads) { - kvfree(buckets); kfree(ret); return NULL; } - ret->hash_buckets = buckets; ret->hash_heads = hash_heads; ret->hash_shift = shift; for (i = 0; i < NEIGH_NUM_HASH_RND; i++) @@ -567,7 +522,6 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table, rcu); - kvfree(nht->hash_buckets); kvfree(nht->hash_heads); kfree(nht); } @@ -596,11 +550,6 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, hash >>= (32 - new_nht->hash_shift); - rcu_assign_pointer(n->next, - rcu_dereference_protected( - new_nht->hash_buckets[hash], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(new_nht->hash_buckets[hash], n); hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]); } @@ -705,10 +654,6 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); - rcu_assign_pointer(n->next, - rcu_dereference_protected(nht->hash_buckets[hash_val], - lockdep_is_held(&tbl->lock))); - rcu_assign_pointer(nht->hash_buckets[hash_val], n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); @@ -942,7 +887,6 @@ static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neigh_hash_table *nht; - struct neighbour __rcu **np; struct hlist_node *tmp; struct neighbour *n; unsigned int i; @@ -970,8 +914,6 @@ static void neigh_periodic_work(struct work_struct *work) goto out; for (i = 0 ; i < (1 << nht->hash_shift); i++) { - np = &nht->hash_buckets[i]; - neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { unsigned int state; @@ -981,7 +923,7 @@ static void neigh_periodic_work(struct work_struct *work) if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); - goto next_elt; + continue; } if (time_before(n->used, n->confirmed) && @@ -992,9 +934,6 @@ static void neigh_periodic_work(struct work_struct *work) (state == NUD_FAILED || !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); hlist_del_rcu(&n->hash); neigh_mark_dead(n); write_unlock(&n->lock); @@ -1002,9 +941,6 @@ static void neigh_periodic_work(struct work_struct *work) continue; } write_unlock(&n->lock); - -next_elt: - np = &n->next; } /* * It's fine to release lock here, even if hash table @@ -1951,7 +1887,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, extack); write_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); + neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); out: @@ -3108,24 +3044,18 @@ void __neigh_for_each_release(struct neigh_table *tbl, nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain < (1 << nht->hash_shift); chain++) { - struct neighbour __rcu **np; struct hlist_node *tmp; struct neighbour *n; - np = &nht->hash_buckets[chain]; neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) { int release; write_lock(&n->lock); release = cb(n); if (release) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); hlist_del_rcu(&n->hash); neigh_mark_dead(n); - } else - np = &n->next; + } write_unlock(&n->lock); if (release) neigh_cleanup_and_release(n); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 11c1519b3699..cb9a7ed8abd3 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1215,7 +1215,7 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); - neigh_remove_one(neigh, tbl); + neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } -- cgit v1.3 From f7f52738637f4361c108cad36e23ee98959a9006 Mon Sep 17 00:00:00 2001 From: Gilad Naaman Date: Thu, 7 Nov 2024 16:04:43 +0000 Subject: neighbour: Create netdev->neighbour association Create a mapping between a netdev and its neighoburs, allowing for much cheaper flushes. Signed-off-by: Gilad Naaman Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241107160444.2913124-7-gnaaman@drivenets.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 1 + include/linux/netdevice.h | 7 ++ include/net/neighbour.h | 9 +- include/net/neighbour_tables.h | 12 +++ net/core/neighbour.c | 96 +++++++++++++--------- 5 files changed, 80 insertions(+), 45 deletions(-) create mode 100644 include/net/neighbour_tables.h (limited to 'net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index ade50d4e67cf..15e31ece675f 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -188,4 +188,5 @@ u64 max_pacing_offload_horizon struct_napi_config* napi_config unsigned_long gro_flush_timeout u32 napi_defer_hard_irqs +struct hlist_head neighbours[2] =================================== =========================== =================== =================== =================================================================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3c552b648b27..df4483598628 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2032,6 +2033,9 @@ enum netdev_reg_state { * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * + * @neighbours: List heads pointing to this device's neighbours' + * dev_list, one per address-family. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2440,6 +2444,9 @@ struct net_device { */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif + + struct hlist_head neighbours[NEIGH_NR_TABLES]; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 40aac1e24c68..9a832cab5b1d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -29,6 +29,7 @@ #include #include #include +#include /* * NUD stands for "neighbor unreachability detection" @@ -136,6 +137,7 @@ struct neigh_statistics { struct neighbour { struct hlist_node hash; + struct hlist_node dev_list; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; @@ -236,13 +238,6 @@ struct neigh_table { struct pneigh_entry **phash_buckets; }; -enum { - NEIGH_ARP_TABLE = 0, - NEIGH_ND_TABLE = 1, - NEIGH_NR_TABLES, - NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ -}; - static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; diff --git a/include/net/neighbour_tables.h b/include/net/neighbour_tables.h new file mode 100644 index 000000000000..bcffbe8f7601 --- /dev/null +++ b/include/net/neighbour_tables.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_NEIGHBOUR_TABLES_H +#define _NET_NEIGHBOUR_TABLES_H + +enum { + NEIGH_ARP_TABLE = 0, + NEIGH_ND_TABLE = 1, + NEIGH_NR_TABLES, + NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ +}; + +#endif diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 59f359c7b5e3..5e572f6eaf2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -60,6 +60,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, static const struct seq_operations neigh_stat_seq_ops; #endif +static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family) +{ + int i; + + switch (family) { + default: + DEBUG_NET_WARN_ON_ONCE(1); + fallthrough; /* to avoid panic by null-ptr-deref */ + case AF_INET: + i = NEIGH_ARP_TABLE; + break; + case AF_INET6: + i = NEIGH_ND_TABLE; + break; + } + + return &dev->neighbours[i]; +} + /* Neighbour hash table buckets are protected with rwlock tbl->lock. @@ -211,6 +230,7 @@ bool neigh_remove_one(struct neighbour *n) write_lock(&n->lock); if (refcount_read(&n->refcnt) == 1) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); retval = true; } @@ -351,48 +371,42 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - int i; - struct neigh_hash_table *nht; - - nht = rcu_dereference_protected(tbl->nht, - lockdep_is_held(&tbl->lock)); + struct hlist_head *dev_head; + struct hlist_node *tmp; + struct neighbour *n; - for (i = 0; i < (1 << nht->hash_shift); i++) { - struct hlist_node *tmp; - struct neighbour *n; + dev_head = neigh_get_dev_table(dev, tbl->family); - neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) { - if (dev && n->dev != dev) - continue; - if (skip_perm && n->nud_state & NUD_PERMANENT) - continue; + hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { + if (skip_perm && n->nud_state & NUD_PERMANENT) + continue; - hlist_del_rcu(&n->hash); - write_lock(&n->lock); - neigh_del_timer(n); - neigh_mark_dead(n); - if (refcount_read(&n->refcnt) != 1) { - /* The most unpleasant situation. - We must destroy neighbour entry, - but someone still uses it. - - The destroy will be delayed until - the last user releases us, but - we must kill timers etc. and move - it to safe state. - */ - __skb_queue_purge(&n->arp_queue); - n->arp_queue_len_bytes = 0; - WRITE_ONCE(n->output, neigh_blackhole); - if (n->nud_state & NUD_VALID) - n->nud_state = NUD_NOARP; - else - n->nud_state = NUD_NONE; - neigh_dbg(2, "neigh %p is stray\n", n); - } - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); + write_lock(&n->lock); + neigh_del_timer(n); + neigh_mark_dead(n); + if (refcount_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + * We must destroy neighbour entry, + * but someone still uses it. + * + * The destroy will be delayed until + * the last user releases us, but + * we must kill timers etc. and move + * it to safe state. + */ + __skb_queue_purge(&n->arp_queue); + n->arp_queue_len_bytes = 0; + WRITE_ONCE(n->output, neigh_blackhole); + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + neigh_dbg(2, "neigh %p is stray\n", n); } + write_unlock(&n->lock); + neigh_cleanup_and_release(n); } } @@ -655,6 +669,10 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, if (want_ref) neigh_hold(n); hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]); + + hlist_add_head_rcu(&n->dev_list, + neigh_get_dev_table(dev, tbl->family)); + write_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; @@ -935,6 +953,7 @@ static void neigh_periodic_work(struct work_struct *work) !time_in_range_open(jiffies, n->used, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); write_unlock(&n->lock); neigh_cleanup_and_release(n); @@ -3054,6 +3073,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, release = cb(n); if (release) { hlist_del_rcu(&n->hash); + hlist_del_rcu(&n->dev_list); neigh_mark_dead(n); } write_unlock(&n->lock); -- cgit v1.3 From 774ca6d3bf24287ff60b7d6dd4171ebb6e47760a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 5 Nov 2024 15:39:54 +0200 Subject: bridge: Allow deleting FDB entries with non-existent VLAN It is currently impossible to delete individual FDB entries (as opposed to flushing) that were added with a VLAN that no longer exists: # ip link add name dummy1 up type dummy # ip link add name br1 up type bridge vlan_filtering 1 # ip link set dev dummy1 master br1 # bridge fdb add 00:11:22:33:44:55 dev dummy1 master static vlan 1 # bridge vlan del vid 1 dev dummy1 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1 00:11:22:33:44:55 dev dummy1 vlan 1 master br1 static # bridge fdb del 00:11:22:33:44:55 dev dummy1 master vlan 1 RTNETLINK answers: Invalid argument # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1 00:11:22:33:44:55 dev dummy1 vlan 1 master br1 static This is in contrast to MDB entries that can be deleted after the VLAN was deleted: # bridge vlan add vid 10 dev dummy1 # bridge mdb add dev br1 port dummy1 grp 239.1.1.1 permanent vid 10 # bridge vlan del vid 10 dev dummy1 # bridge mdb get dev br1 grp 239.1.1.1 vid 10 dev br1 port dummy1 grp 239.1.1.1 permanent vid 10 # bridge mdb del dev br1 port dummy1 grp 239.1.1.1 permanent vid 10 # bridge mdb get dev br1 grp 239.1.1.1 vid 10 Error: bridge: MDB entry not found. Align the two interfaces and allow user space to delete FDB entries that were added with a VLAN that no longer exists: # ip link add name dummy1 up type dummy # ip link add name br1 up type bridge vlan_filtering 1 # ip link set dev dummy1 master br1 # bridge fdb add 00:11:22:33:44:55 dev dummy1 master static vlan 1 # bridge vlan del vid 1 dev dummy1 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1 00:11:22:33:44:55 dev dummy1 vlan 1 master br1 static # bridge fdb del 00:11:22:33:44:55 dev dummy1 master vlan 1 # bridge fdb get 00:11:22:33:44:55 br br1 vlan 1 Error: Fdb entry not found. Add a selftest to make sure this behavior does not regress: # ./rtnetlink.sh -t kci_test_fdb_del PASS: bridge fdb del Signed-off-by: Ido Schimmel Reviewed-by: Andy Roulin Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241105133954.350479-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_fdb.c | 9 ++----- tools/testing/selftests/net/rtnetlink.sh | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 1cd7bade9b3b..77f110035df1 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1319,7 +1319,6 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; - struct net_bridge_vlan *v; struct net_bridge *br; int err; @@ -1338,14 +1337,10 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], } if (vid) { - v = br_vlan_find(vg, vid); - if (!v) { - pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name); - return -EINVAL; - } - err = __br_fdb_delete(br, p, addr, vid); } else { + struct net_bridge_vlan *v; + err = -ENOENT; err &= __br_fdb_delete(br, p, addr, 0); if (!vg || !vg->num_vlans) diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index 87dce3efe31e..6e216d7a8e2f 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -25,6 +25,7 @@ ALL_TESTS=" kci_test_ipsec kci_test_ipsec_offload kci_test_fdb_get + kci_test_fdb_del kci_test_neigh_get kci_test_bridge_parent_id kci_test_address_proto @@ -1065,6 +1066,45 @@ kci_test_fdb_get() end_test "PASS: bridge fdb get" } +kci_test_fdb_del() +{ + local test_mac=de:ad:be:ef:13:37 + local dummydev="dummy1" + local brdev="test-br0" + local ret=0 + + run_cmd_grep 'bridge fdb get' bridge fdb help + if [ $? -ne 0 ]; then + end_test "SKIP: fdb del tests: iproute2 too old" + return $ksft_skip + fi + + setup_ns testns + if [ $? -ne 0 ]; then + end_test "SKIP fdb del tests: cannot add net namespace $testns" + return $ksft_skip + fi + IP="ip -netns $testns" + BRIDGE="bridge -netns $testns" + run_cmd $IP link add $dummydev type dummy + run_cmd $IP link add name $brdev type bridge vlan_filtering 1 + run_cmd $IP link set dev $dummydev master $brdev + run_cmd $BRIDGE fdb add $test_mac dev $dummydev master static vlan 1 + run_cmd $BRIDGE vlan del vid 1 dev $dummydev + run_cmd $BRIDGE fdb get $test_mac br $brdev vlan 1 + run_cmd $BRIDGE fdb del $test_mac dev $dummydev master vlan 1 + run_cmd_fail $BRIDGE fdb get $test_mac br $brdev vlan 1 + + ip netns del $testns &>/dev/null + + if [ $ret -ne 0 ]; then + end_test "FAIL: bridge fdb del" + return 1 + fi + + end_test "PASS: bridge fdb del" +} + kci_test_neigh_get() { dstmac=de:ad:be:ef:13:37 -- cgit v1.3 From a885a6b2d37eaaae08323583bdb1928c8a2935fc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 8 Nov 2024 11:41:45 +0100 Subject: net: convert to nla_get_*_default() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most of the original conversion is from the spatch below, but I edited some and left out other instances that were either buggy after conversion (where default values don't fit into the type) or just looked strange. @@ expression attr, def; expression val; identifier fn =~ "^nla_get_.*"; fresh identifier dfn = fn ## "_default"; @@ ( -if (attr) - val = fn(attr); -else - val = def; +val = dfn(attr, def); | -if (!attr) - val = def; -else - val = fn(attr); +val = dfn(attr, def); | -if (!attr) - return def; -return fn(attr); +return dfn(attr, def); | -attr ? fn(attr) : def +dfn(attr, def) | -!attr ? def : fn(attr) +dfn(attr, def) ) Signed-off-by: Johannes Berg Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241108114145.0580b8684e7f.I740beeaa2f70ebfc19bfca1045a24d6151992790@changeid Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 12 ++++------ drivers/net/gtp.c | 16 ++++--------- drivers/net/macsec.c | 6 ++--- drivers/net/vxlan/vxlan_core.c | 5 +--- net/8021q/vlan_netlink.c | 6 ++--- net/core/fib_rules.c | 3 +-- net/core/rtnetlink.c | 5 +--- net/devlink/dev.c | 6 ++--- net/hsr/hsr_netlink.c | 5 +--- net/ieee802154/nl-mac.c | 15 +++--------- net/ieee802154/nl802154.c | 26 ++++++++------------ net/ipv4/devinet.c | 3 +-- net/ipv4/ipmr.c | 6 ++--- net/ipv4/nexthop.c | 13 +++------- net/ipv4/route.c | 10 ++++---- net/ipv6/addrconf.c | 7 ++---- net/ipv6/ila/ila_xlat.c | 15 ++++-------- net/ipv6/ioam6.c | 14 ++++------- net/ipv6/ioam6_iptunnel.c | 6 ++--- net/ipv6/ip6mr.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 5 +--- net/netfilter/nf_nat_core.c | 6 ++--- net/netfilter/nft_tunnel.c | 5 +--- net/netlabel/netlabel_mgmt.c | 13 ++++------ net/openvswitch/datapath.c | 10 ++++---- net/openvswitch/flow_netlink.c | 2 +- net/sched/act_ct.c | 10 ++++---- net/sched/act_ctinfo.c | 8 +++---- net/sched/act_gate.c | 11 +++------ net/sched/act_mpls.c | 18 +++++++------- net/sched/act_police.c | 6 ++--- net/sched/cls_api.c | 8 +++---- net/sched/sch_choke.c | 2 +- net/sched/sch_gred.c | 2 +- net/sched/sch_htb.c | 4 ++-- net/sched/sch_qfq.c | 5 +--- net/sched/sch_red.c | 2 +- net/sched/sch_taprio.c | 2 +- net/wireless/nl80211.c | 54 ++++++++++++++---------------------------- net/xfrm/xfrm_user.c | 8 +++---- 40 files changed, 126 insertions(+), 236 deletions(-) (limited to 'net') diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 0433a0f36d1b..98c6205ed19f 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3206,15 +3206,11 @@ static int amt_newlink(struct net *net, struct net_device *dev, goto err; } - if (data[IFLA_AMT_RELAY_PORT]) - amt->relay_port = nla_get_be16(data[IFLA_AMT_RELAY_PORT]); - else - amt->relay_port = htons(IANA_AMT_UDP_PORT); + amt->relay_port = nla_get_be16_default(data[IFLA_AMT_RELAY_PORT], + htons(IANA_AMT_UDP_PORT)); - if (data[IFLA_AMT_GATEWAY_PORT]) - amt->gw_port = nla_get_be16(data[IFLA_AMT_GATEWAY_PORT]); - else - amt->gw_port = htons(IANA_AMT_UDP_PORT); + amt->gw_port = nla_get_be16_default(data[IFLA_AMT_GATEWAY_PORT], + htons(IANA_AMT_UDP_PORT)); if (!amt->relay_port) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_AMT_DISCOVERY_IP], diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 70f981887518..89a996ad8cd0 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1491,10 +1491,8 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev, } gtp->role = role; - if (!data[IFLA_GTP_RESTART_COUNT]) - gtp->restart_count = 0; - else - gtp->restart_count = nla_get_u8(data[IFLA_GTP_RESTART_COUNT]); + gtp->restart_count = nla_get_u8_default(data[IFLA_GTP_RESTART_COUNT], + 0); gtp->net = src_net; @@ -1829,10 +1827,7 @@ static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, version = nla_get_u32(info->attrs[GTPA_VERSION]); - if (info->attrs[GTPA_FAMILY]) - family = nla_get_u8(info->attrs[GTPA_FAMILY]); - else - family = AF_INET; + family = nla_get_u8_default(info->attrs[GTPA_FAMILY], AF_INET); #if !IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) @@ -2069,10 +2064,7 @@ static struct pdp_ctx *gtp_find_pdp_by_link(struct net *net, struct gtp_dev *gtp; int family; - if (nla[GTPA_FAMILY]) - family = nla_get_u8(nla[GTPA_FAMILY]); - else - family = AF_INET; + family = nla_get_u8_default(nla[GTPA_FAMILY], AF_INET); gtp = gtp_find_dev(net, nla); if (!gtp) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index ee2159282573..53dc89a6ae67 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4299,9 +4299,9 @@ static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], } } - es = data[IFLA_MACSEC_ES] ? nla_get_u8(data[IFLA_MACSEC_ES]) : false; - sci = data[IFLA_MACSEC_INC_SCI] ? nla_get_u8(data[IFLA_MACSEC_INC_SCI]) : false; - scb = data[IFLA_MACSEC_SCB] ? nla_get_u8(data[IFLA_MACSEC_SCB]) : false; + es = nla_get_u8_default(data[IFLA_MACSEC_ES], false); + sci = nla_get_u8_default(data[IFLA_MACSEC_INC_SCI], false); + scb = nla_get_u8_default(data[IFLA_MACSEC_SCB], false); if ((sci && (scb || es)) || (scb && es)) return -EINVAL; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 841b59d1c1c2..42b07bc2b107 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1232,10 +1232,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } - if (tb[NDA_NH_ID]) - *nhid = nla_get_u32(tb[NDA_NH_ID]); - else - *nhid = 0; + *nhid = nla_get_u32_default(tb[NDA_NH_ID], 0); return 0; } diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index cf5219df7903..134419667d59 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -161,10 +161,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -ENODEV; } - if (data[IFLA_VLAN_PROTOCOL]) - proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); - else - proto = htons(ETH_P_8021Q); + proto = nla_get_be16_default(data[IFLA_VLAN_PROTOCOL], + htons(ETH_P_8021Q)); vlan->vlan_proto = proto; vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d0de9677f450..34185d138c95 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -558,8 +558,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, nlrule->pref = fib_default_rule_pref(ops); } - nlrule->proto = tb[FRA_PROTOCOL] ? - nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { struct net_device *dev; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3b33810d92a8..a5c386a45501 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2940,10 +2940,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, const char *pat = ifname[0] ? ifname : NULL; int new_ifindex; - if (tb[IFLA_NEW_IFINDEX]) - new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); - else - new_ifindex = 0; + new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); if (err) diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 9264bbc90d0c..d6e3db300acb 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -531,10 +531,8 @@ int devlink_nl_reload_doit(struct sk_buff *skb, struct genl_info *info) return err; } - if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) - action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); - else - action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; + action = nla_get_u8_default(info->attrs[DEVLINK_ATTR_RELOAD_ACTION], + DEVLINK_RELOAD_ACTION_DRIVER_REINIT); if (!devlink_reload_action_is_supported(devlink, action)) { NL_SET_ERR_MSG(info->extack, "Requested reload action is not supported by the driver"); diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 6f09b9512484..b68f2f71d0e1 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -82,10 +82,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; } - if (!data[IFLA_HSR_MULTICAST_SPEC]) - multicast_spec = 0; - else - multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); + multicast_spec = nla_get_u8_default(data[IFLA_HSR_MULTICAST_SPEC], 0); if (data[IFLA_HSR_PROTOCOL]) proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]); diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 29bf97640166..74ef0a310afb 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -202,10 +202,7 @@ int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info) addr.pan_id = nla_get_shortaddr( info->attrs[IEEE802154_ATTR_COORD_PAN_ID]); - if (info->attrs[IEEE802154_ATTR_PAGE]) - page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); - else - page = 0; + page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr, nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]), @@ -338,10 +335,7 @@ int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info) blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]); coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]); - if (info->attrs[IEEE802154_ATTR_PAGE]) - page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); - else - page = 0; + page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS); @@ -388,10 +382,7 @@ int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info) channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]); duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]); - if (info->attrs[IEEE802154_ATTR_PAGE]) - page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]); - else - page = 0; + page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0); ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels, page, duration); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 7eb37de3add2..5a024ca60d35 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1438,22 +1438,18 @@ static int nl802154_trigger_scan(struct sk_buff *skb, struct genl_info *info) } /* Use current page by default */ - if (info->attrs[NL802154_ATTR_PAGE]) - request->page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]); - else - request->page = wpan_phy->current_page; + request->page = nla_get_u8_default(info->attrs[NL802154_ATTR_PAGE], + wpan_phy->current_page); /* Scan all supported channels by default */ - if (info->attrs[NL802154_ATTR_SCAN_CHANNELS]) - request->channels = nla_get_u32(info->attrs[NL802154_ATTR_SCAN_CHANNELS]); - else - request->channels = wpan_phy->supported.channels[request->page]; + request->channels = + nla_get_u32_default(info->attrs[NL802154_ATTR_SCAN_CHANNELS], + wpan_phy->supported.channels[request->page]); /* Use maximum duration order by default */ - if (info->attrs[NL802154_ATTR_SCAN_DURATION]) - request->duration = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_DURATION]); - else - request->duration = IEEE802154_MAX_SCAN_DURATION; + request->duration = + nla_get_u8_default(info->attrs[NL802154_ATTR_SCAN_DURATION], + IEEE802154_MAX_SCAN_DURATION); err = rdev_trigger_scan(rdev, request); if (err) { @@ -1598,10 +1594,8 @@ nl802154_send_beacons(struct sk_buff *skb, struct genl_info *info) request->wpan_phy = wpan_phy; /* Use maximum duration order by default */ - if (info->attrs[NL802154_ATTR_BEACON_INTERVAL]) - request->interval = nla_get_u8(info->attrs[NL802154_ATTR_BEACON_INTERVAL]); - else - request->interval = IEEE802154_MAX_SCAN_DURATION; + request->interval = nla_get_u8_default(info->attrs[NL802154_ATTR_BEACON_INTERVAL], + IEEE802154_MAX_SCAN_DURATION); err = rdev_send_beacons(rdev, request); if (err) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f58f39a9ee87..c8b3cf5fba4c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -926,8 +926,7 @@ static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); - ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : - ifm->ifa_flags; + ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 99e7cd0531d9..c58dd78509a2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2546,9 +2546,9 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; - grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; - tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; + src = nla_get_in_addr_default(tb[RTA_SRC], 0); + grp = nla_get_in_addr_default(tb[RTA_DST], 0); + tableid = nla_get_u32_default(tb[RTA_TABLE], 0); mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); if (!mrt) { diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 570e450e008c..09a3d73b45ba 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3247,12 +3247,8 @@ static int nh_valid_get_del_req(const struct nlmsghdr *nlh, return -EINVAL; } - if (op_flags) { - if (tb[NHA_OP_FLAGS]) - *op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); - else - *op_flags = 0; - } + if (op_flags) + *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return 0; } @@ -3433,10 +3429,7 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, if (err < 0) return err; - if (tb[NHA_OP_FLAGS]) - filter->op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); - else - filter->op_flags = 0; + filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0); return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 763398e08b7d..4c5e773002fe 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3231,10 +3231,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; rtm = nlmsg_data(nlh); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; - dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; - iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; - mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + src = nla_get_in_addr_default(tb[RTA_SRC], 0); + dst = nla_get_in_addr_default(tb[RTA_DST], 0); + iif = nla_get_u32_default(tb[RTA_IIF], 0); + mark = nla_get_u32_default(tb[RTA_MARK], 0); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else @@ -3260,7 +3260,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK; - fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d0a99710d65d..96b5b2b0d507 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4793,7 +4793,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!pfx) return -EINVAL; - ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; + ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); /* We ignore other flags so far. */ ifa_flags &= IFA_F_MANAGETEMPADDR; @@ -5018,10 +5018,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENODEV; } - if (tb[IFA_FLAGS]) - cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]); - else - cfg.ifa_flags = ifm->ifa_flags; + cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); /* We ignore other flags so far. */ cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 534a4498e280..7646e401c630 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -105,16 +105,11 @@ static int parse_nl_config(struct genl_info *info, xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); - if (info->attrs[ILA_ATTR_CSUM_MODE]) - xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); - else - xp->ip.csum_mode = ILA_CSUM_NO_ACTION; - - if (info->attrs[ILA_ATTR_IDENT_TYPE]) - xp->ip.ident_type = nla_get_u8( - info->attrs[ILA_ATTR_IDENT_TYPE]); - else - xp->ip.ident_type = ILA_ATYPE_USE_FORMAT; + xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE], + ILA_CSUM_NO_ACTION); + + xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE], + ILA_ATYPE_USE_FORMAT); if (info->attrs[ILA_ATTR_IFINDEX]) xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index 08c929513065..a84d332f952f 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -135,15 +135,11 @@ static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) ns->id = id; - if (!info->attrs[IOAM6_ATTR_NS_DATA]) - data32 = IOAM6_U32_UNAVAILABLE; - else - data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); - - if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) - data64 = IOAM6_U64_UNAVAILABLE; - else - data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + data32 = nla_get_u32_default(info->attrs[IOAM6_ATTR_NS_DATA], + IOAM6_U32_UNAVAILABLE); + + data64 = nla_get_u64_default(info->attrs[IOAM6_ATTR_NS_DATA_WIDE], + IOAM6_U64_UNAVAILABLE); ns->data = cpu_to_be32(data32); ns->data_wide = cpu_to_be64(data64); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index beb6b4cfc551..9d8422e350f8 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -142,10 +142,8 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, } } - if (!tb[IOAM6_IPTUNNEL_MODE]) - mode = IOAM6_IPTUNNEL_MODE_INLINE; - else - mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]); + mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE], + IOAM6_IPTUNNEL_MODE_INLINE); if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) { NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode"); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 8add0f45aa52..d66f58932a79 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2560,7 +2560,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); - tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; + tableid = nla_get_u32_default(tb[RTA_TABLE], 0); mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index dc6ddc4abbe2..7d13110ce188 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3662,10 +3662,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); - if (nla_addr_family) - udest->af = nla_get_u16(nla_addr_family); - else - udest->af = 0; + udest->af = nla_get_u16_default(nla_addr_family, 0); /* If a full entry was requested, check for the additional fields */ if (full_entry) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 4085c436e306..aad84aabd7f1 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1090,10 +1090,8 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], range->flags |= NF_NAT_RANGE_MAP_IPS; } - if (tb[CTA_NAT_V4_MAXIP]) - range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); - else - range->max_addr.ip = range->min_addr.ip; + range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], + range->min_addr.ip); return 0; } diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 5c6ed68cc6e0..681301b46aa4 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -497,10 +497,7 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx, } if (tb[NFTA_TUNNEL_KEY_TOS]) info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]); - if (tb[NFTA_TUNNEL_KEY_TTL]) - info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]); - else - info.key.ttl = U8_MAX; + info.key.ttl = nla_get_u8_default(tb[NFTA_TUNNEL_KEY_TTL], U8_MAX); if (tb[NFTA_TUNNEL_KEY_OPTS]) { err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS], diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index 689eaa2afbec..079fe72a6384 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -107,11 +107,9 @@ static int netlbl_mgmt_add_common(struct genl_info *info, switch (entry->def.type) { case NETLBL_NLTYPE_UNLABELED: - if (info->attrs[NLBL_MGMT_A_FAMILY]) - entry->family = - nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]); - else - entry->family = AF_UNSPEC; + entry->family = + nla_get_u16_default(info->attrs[NLBL_MGMT_A_FAMILY], + AF_UNSPEC); break; case NETLBL_NLTYPE_CIPSOV4: if (!info->attrs[NLBL_MGMT_A_CV4DOI]) @@ -601,10 +599,7 @@ static int netlbl_mgmt_listdef(struct sk_buff *skb, struct genl_info *info) struct netlbl_dom_map *entry; u16 family; - if (info->attrs[NLBL_MGMT_A_FAMILY]) - family = nla_get_u16(info->attrs[NLBL_MGMT_A_FAMILY]); - else - family = AF_INET; + family = nla_get_u16_default(info->attrs[NLBL_MGMT_A_FAMILY], AF_INET); ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (ans_skb == NULL) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 78d9961fcd44..225f6048867f 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1828,8 +1828,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - parms.desired_ifindex = a[OVS_DP_ATTR_IFINDEX] - ? nla_get_s32(a[OVS_DP_ATTR_IFINDEX]) : 0; + parms.desired_ifindex = nla_get_s32_default(a[OVS_DP_ATTR_IFINDEX], 0); /* So far only local changes have been made, now need the lock. */ ovs_lock(); @@ -2266,8 +2265,7 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; - port_no = a[OVS_VPORT_ATTR_PORT_NO] - ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; + port_no = nla_get_u32_default(a[OVS_VPORT_ATTR_PORT_NO], 0); if (port_no >= DP_MAX_PORTS) return -EFBIG; @@ -2304,8 +2302,8 @@ restart: parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; - parms.desired_ifindex = a[OVS_VPORT_ATTR_IFINDEX] - ? nla_get_s32(a[OVS_VPORT_ATTR_IFINDEX]) : 0; + parms.desired_ifindex = nla_get_s32_default(a[OVS_VPORT_ATTR_IFINDEX], + 0); vport = new_vport(&parms); err = PTR_ERR(vport); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 729ef582a3a8..881ddd3696d5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1938,7 +1938,7 @@ int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) { - return attr ? nla_get_u32(attr) : 0; + return nla_get_u32_default(attr, 0); } /** diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2197eb625658..c02f39efc6ef 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1183,9 +1183,8 @@ static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, range->min_addr.ip = nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]); - range->max_addr.ip = max_attr ? - nla_get_in_addr(max_attr) : - range->min_addr.ip; + range->max_addr.ip = + nla_get_in_addr_default(max_attr, range->min_addr.ip); } else if (tb[TCA_CT_NAT_IPV6_MIN]) { struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX]; @@ -1314,8 +1313,9 @@ static int tcf_ct_fill_params(struct net *net, err = -EINVAL; goto err; } - family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET; - proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP; + family = nla_get_u8_default(tb[TCA_CT_HELPER_FAMILY], AF_INET); + proto = nla_get_u8_default(tb[TCA_CT_HELPER_PROTO], + IPPROTO_TCP); err = nf_ct_add_helper(tmpl, name, family, proto, p->ct_action & TCA_CT_ACT_NAT, &p->helper); if (err) { diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 5dd41a012110..5b1241ddc758 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -197,8 +197,9 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, "dscp mask must be 6 contiguous bits"); return -EINVAL; } - dscpstatemask = tb[TCA_CTINFO_PARMS_DSCP_STATEMASK] ? - nla_get_u32(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK]) : 0; + dscpstatemask = + nla_get_u32_default(tb[TCA_CTINFO_PARMS_DSCP_STATEMASK], + 0); /* mask & statemask must not overlap */ if (dscpmask & dscpstatemask) { NL_SET_ERR_MSG_ATTR(extack, @@ -243,8 +244,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, } cp_new->net = net; - cp_new->zone = tb[TCA_CTINFO_ZONE] ? - nla_get_u16(tb[TCA_CTINFO_ZONE]) : 0; + cp_new->zone = nla_get_u16_default(tb[TCA_CTINFO_ZONE], 0); if (dscpmask) { cp_new->dscpmask = dscpmask; cp_new->dscpmaskshift = dscpmaskshift; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 1dd74125398a..91c0ec729823 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -190,15 +190,10 @@ static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, entry->interval = interval; - if (tb[TCA_GATE_ENTRY_IPV]) - entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); - else - entry->ipv = -1; + entry->ipv = nla_get_s32_default(tb[TCA_GATE_ENTRY_IPV], -1); - if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) - entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); - else - entry->maxoctets = -1; + entry->maxoctets = nla_get_s32_default(tb[TCA_GATE_ENTRY_MAX_OCTETS], + -1); return 0; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 44a37a71ae92..9f86f4e666d3 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -288,16 +288,14 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, } p->tcfm_action = parm->m_action; - p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) : - ACT_MPLS_LABEL_NOT_SET; - p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) : - ACT_MPLS_TC_NOT_SET; - p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) : - mpls_ttl; - p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) : - ACT_MPLS_BOS_NOT_SET; - p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) : - htons(ETH_P_MPLS_UC); + p->tcfm_label = nla_get_u32_default(tb[TCA_MPLS_LABEL], + ACT_MPLS_LABEL_NOT_SET); + p->tcfm_tc = nla_get_u8_default(tb[TCA_MPLS_TC], ACT_MPLS_TC_NOT_SET); + p->tcfm_ttl = nla_get_u8_default(tb[TCA_MPLS_TTL], mpls_ttl); + p->tcfm_bos = nla_get_u8_default(tb[TCA_MPLS_BOS], + ACT_MPLS_BOS_NOT_SET); + p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], + htons(ETH_P_MPLS_UC)); spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8555125ed34d..a214ed681142 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -167,8 +167,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (R_tab) { new->rate_present = true; - rate64 = tb[TCA_POLICE_RATE64] ? - nla_get_u64(tb[TCA_POLICE_RATE64]) : 0; + rate64 = nla_get_u64_default(tb[TCA_POLICE_RATE64], 0); psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64); qdisc_put_rtab(R_tab); } else { @@ -176,8 +175,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } if (P_tab) { new->peak_present = true; - prate64 = tb[TCA_POLICE_PEAKRATE64] ? - nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0; + prate64 = nla_get_u64_default(tb[TCA_POLICE_PEAKRATE64], 0); psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64); qdisc_put_rtab(P_tab); } else { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2a7d856cc334..04942f8c62e0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2297,7 +2297,7 @@ replay: } block->classid = parent; - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -2509,7 +2509,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, goto errout; } - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -2664,7 +2664,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, goto errout; } - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; @@ -3104,7 +3104,7 @@ replay: if (IS_ERR(block)) return PTR_ERR(block); - chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + chain_index = nla_get_u32_default(tca[TCA_CHAIN], 0); if (chain_index > TC_ACT_EXT_VAL_MASK) { NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); err = -EINVAL; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 91072010923d..1e940ad0d2fa 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -356,7 +356,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, tb[TCA_CHOKE_STAB] == NULL) return -EINVAL; - max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0; + max_P = nla_get_u32_default(tb[TCA_CHOKE_MAX_P], 0); ctl = nla_data(tb[TCA_CHOKE_PARMS]); stab = nla_data(tb[TCA_CHOKE_STAB]); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 79ba9dc70254..7d2151c62c4a 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -668,7 +668,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } - max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0; + max_P = nla_get_u32_default(tb[TCA_GRED_MAX_P], 0); ctl = nla_data(tb[TCA_GRED_PARMS]); stab = nla_data(tb[TCA_GRED_STAB]); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index ff3de37874e4..c31bc5489bdd 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1810,8 +1810,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_put_rtab(qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB], NULL)); - rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; - ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + rate64 = nla_get_u64_default(tb[TCA_HTB_RATE64], 0); + ceil64 = nla_get_u64_default(tb[TCA_HTB_CEIL64], 0); if (!cl) { /* new class */ struct net_device *dev = qdisc_dev(sch); diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index d584c0c25899..6a07cdbdb9e1 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -421,10 +421,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (err < 0) return err; - if (tb[TCA_QFQ_WEIGHT]) - weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); - else - weight = 1; + weight = nla_get_u32_default(tb[TCA_QFQ_WEIGHT], 1); if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index b5f096588fae..6029bc29b51e 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -248,7 +248,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, tb[TCA_RED_STAB] == NULL) return -EINVAL; - max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; + max_P = nla_get_u32_default(tb[TCA_RED_MAX_P], 0); ctl = nla_data(tb[TCA_RED_PARMS]); stab = nla_data(tb[TCA_RED_STAB]); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8623dc0bafc0..a68e17891b0b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1828,7 +1828,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ - taprio_flags = tb[TCA_TAPRIO_ATTR_FLAGS] ? nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]) : 0; + taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1ac8a196f376..8cc9b968dbd8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1286,10 +1286,7 @@ static unsigned int nl80211_link_id(struct nlattr **attrs) { struct nlattr *linkid = attrs[NL80211_ATTR_MLO_LINK_ID]; - if (!linkid) - return 0; - - return nla_get_u8(linkid); + return nla_get_u8_default(linkid, 0); } static int nl80211_link_id_or_invalid(struct nlattr **attrs) @@ -3414,11 +3411,9 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - if (attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) - chandef->freq1_offset = nla_get_u32( - attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]); - else - chandef->freq1_offset = 0; + chandef->freq1_offset = + nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], + 0); } if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = @@ -8265,11 +8260,9 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) if (unlikely(!rcu_access_pointer(cfg80211_regdomain))) return -EINPROGRESS; - if (info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE]) - user_reg_hint_type = - nla_get_u32(info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE]); - else - user_reg_hint_type = NL80211_USER_REG_HINT_USER; + user_reg_hint_type = + nla_get_u32_default(info->attrs[NL80211_ATTR_USER_REG_HINT_TYPE], + NL80211_USER_REG_HINT_USER); switch (user_reg_hint_type) { case NL80211_USER_REG_HINT_USER: @@ -11087,11 +11080,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, nla_len(info->attrs[NL80211_ATTR_SAE_PASSWORD]); } - if (info->attrs[NL80211_ATTR_SAE_PWE]) - settings->sae_pwe = - nla_get_u8(info->attrs[NL80211_ATTR_SAE_PWE]); - else - settings->sae_pwe = NL80211_SAE_PWE_UNSPECIFIED; + settings->sae_pwe = + nla_get_u8_default(info->attrs[NL80211_ATTR_SAE_PWE], + NL80211_SAE_PWE_UNSPECIFIED); return 0; } @@ -12347,10 +12338,8 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) return -EPERM; - if (!info->attrs[NL80211_ATTR_REASON_CODE]) - reason = WLAN_REASON_DEAUTH_LEAVING; - else - reason = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + reason = nla_get_u16_default(info->attrs[NL80211_ATTR_REASON_CODE], + WLAN_REASON_DEAUTH_LEAVING); if (reason == 0) return -EINVAL; @@ -13696,10 +13685,7 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, cfg->dst = nla_get_in_addr(tb[NL80211_WOWLAN_TCP_DST_IPV4]); memcpy(cfg->dst_mac, nla_data(tb[NL80211_WOWLAN_TCP_DST_MAC]), ETH_ALEN); - if (tb[NL80211_WOWLAN_TCP_SRC_PORT]) - port = nla_get_u16(tb[NL80211_WOWLAN_TCP_SRC_PORT]); - else - port = 0; + port = nla_get_u16_default(tb[NL80211_WOWLAN_TCP_SRC_PORT], 0); #ifdef CONFIG_INET /* allocate a socket and port for it and use it */ err = __sock_create(wiphy_net(&rdev->wiphy), PF_INET, SOCK_STREAM, @@ -13910,11 +13896,9 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) pat_len < wowlan->pattern_min_len) goto error; - if (!pat_tb[NL80211_PKTPAT_OFFSET]) - pkt_offset = 0; - else - pkt_offset = nla_get_u32( - pat_tb[NL80211_PKTPAT_OFFSET]); + pkt_offset = + nla_get_u32_default(pat_tb[NL80211_PKTPAT_OFFSET], + 0); if (pkt_offset > wowlan->max_pkt_offset) goto error; new_triggers.patterns[i].pkt_offset = pkt_offset; @@ -14158,10 +14142,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, pat_len < coalesce->pattern_min_len) return -EINVAL; - if (!pat_tb[NL80211_PKTPAT_OFFSET]) - pkt_offset = 0; - else - pkt_offset = nla_get_u32(pat_tb[NL80211_PKTPAT_OFFSET]); + pkt_offset = nla_get_u32_default(pat_tb[NL80211_PKTPAT_OFFSET], + 0); if (pkt_offset > coalesce->max_pkt_offset) return -EINVAL; new_rule->patterns[i].pkt_offset = pkt_offset; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e3b8ce89831a..e0dd9dfd71c2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -200,7 +200,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, struct netlink_ext_ack *extack) { int err; - u8 sa_dir = attrs[XFRMA_SA_DIR] ? nla_get_u8(attrs[XFRMA_SA_DIR]) : 0; + u8 sa_dir = nla_get_u8_default(attrs[XFRMA_SA_DIR], 0); u16 family = p->sel.family; err = -EINVAL; @@ -767,10 +767,8 @@ static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_SET_MARK]) { m->v = nla_get_u32(attrs[XFRMA_SET_MARK]); - if (attrs[XFRMA_SET_MARK_MASK]) - m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]); - else - m->m = 0xffffffff; + m->m = nla_get_u32_default(attrs[XFRMA_SET_MARK_MASK], + 0xffffffff); } else { m->v = m->m = 0; } -- cgit v1.3 From 3c63d8946e578663b868cb9912dac616ea68bfd0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 17 Sep 2024 12:15:29 -0400 Subject: svcrdma: Address an integer overflow Dan Carpenter reports: > Commit 78147ca8b4a9 ("svcrdma: Add a "parsed chunk list" data > structure") from Jun 22, 2020 (linux-next), leads to the following > Smatch static checker warning: > > net/sunrpc/xprtrdma/svc_rdma_recvfrom.c:498 xdr_check_write_chunk() > warn: potential user controlled sizeof overflow 'segcount * 4 * 4' > > net/sunrpc/xprtrdma/svc_rdma_recvfrom.c > 488 static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) > 489 { > 490 u32 segcount; > 491 __be32 *p; > 492 > 493 if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) > ^^^^^^^^ > > 494 return false; > 495 > 496 /* A bogus segcount causes this buffer overflow check to fail. */ > 497 p = xdr_inline_decode(&rctxt->rc_stream, > --> 498 segcount * rpcrdma_segment_maxsz * sizeof(*p)); > > > segcount is an untrusted u32. On 32bit systems anything >= SIZE_MAX / 16 will > have an integer overflow and some those values will be accepted by > xdr_inline_decode(). Reported-by: Dan Carpenter Fixes: 78147ca8b4a9 ("svcrdma: Add a "parsed chunk list" data structure") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ae3fb9bc8a21..292022f0976e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -493,7 +493,13 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) if (xdr_stream_decode_u32(&rctxt->rc_stream, &segcount)) return false; - /* A bogus segcount causes this buffer overflow check to fail. */ + /* Before trusting the segcount value enough to use it in + * a computation, perform a simple range check. This is an + * arbitrary but sensible limit (ie, not architectural). + */ + if (unlikely(segcount > RPCSVC_MAXPAGES)) + return false; + p = xdr_inline_decode(&rctxt->rc_stream, segcount * rpcrdma_segment_maxsz * sizeof(*p)); return p != NULL; -- cgit v1.3 From 3d18dfe69ce46f106af327736d2261d7e3ee81c0 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 28 Oct 2024 19:53:39 +0800 Subject: mm: page_frag: avoid caller accessing 'page_frag_cache' directly Use appropriate frag_page API instead of caller accessing 'page_frag_cache' directly. CC: Andrew Morton CC: Linux-MM Signed-off-by: Yunsheng Lin Reviewed-by: Alexander Duyck Acked-by: Chuck Lever Link: https://patch.msgid.link/20241028115343.3405838-5-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- drivers/vhost/net.c | 2 +- include/linux/page_frag_cache.h | 10 ++++++++++ net/core/skbuff.c | 6 +++--- net/rxrpc/conn_object.c | 4 +--- net/rxrpc/local_object.c | 4 +--- net/sunrpc/svcsock.c | 6 ++---- tools/testing/selftests/mm/page_frag/page_frag_test.c | 2 +- 7 files changed, 19 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index f16279351db5..9ad37c012189 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1325,7 +1325,7 @@ static int vhost_net_open(struct inode *inode, struct file *f) vqs[VHOST_NET_VQ_RX]); f->private_data = n; - n->pf_cache.va = NULL; + page_frag_cache_init(&n->pf_cache); return 0; } diff --git a/include/linux/page_frag_cache.h b/include/linux/page_frag_cache.h index 67ac8626ed9b..0a52f7a179c8 100644 --- a/include/linux/page_frag_cache.h +++ b/include/linux/page_frag_cache.h @@ -7,6 +7,16 @@ #include #include +static inline void page_frag_cache_init(struct page_frag_cache *nc) +{ + nc->va = NULL; +} + +static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) +{ + return !!nc->pfmemalloc; +} + void page_frag_cache_drain(struct page_frag_cache *nc); void __page_frag_cache_drain(struct page *page, unsigned int count); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00afeb90c23a..6841e61a6bd0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -753,14 +753,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); } else { local_bh_disable(); local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache.page); data = page_frag_alloc(nc, len, gfp_mask); - pfmemalloc = nc->pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(nc); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); local_bh_enable(); @@ -850,7 +850,7 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) len = SKB_HEAD_ALIGN(len); data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = nc->page.pfmemalloc; + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 1539d315afe7..694c4df7a1a3 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -337,9 +337,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) */ rxrpc_purge_queue(&conn->rx_queue); - if (conn->tx_data_alloc.va) - __page_frag_cache_drain(virt_to_page(conn->tx_data_alloc.va), - conn->tx_data_alloc.pagecnt_bias); + page_frag_cache_drain(&conn->tx_data_alloc); call_rcu(&conn->rcu, rxrpc_rcu_free_connection); } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index f9623ace2201..2792d2304605 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -452,9 +452,7 @@ void rxrpc_destroy_local(struct rxrpc_local *local) #endif rxrpc_purge_queue(&local->rx_queue); rxrpc_purge_client_connections(local); - if (local->tx_alloc.va) - __page_frag_cache_drain(virt_to_page(local->tx_alloc.va), - local->tx_alloc.pagecnt_bias); + page_frag_cache_drain(&local->tx_alloc); } /* diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 825ec5357691..b785425c3315 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1608,7 +1608,6 @@ static void svc_tcp_sock_detach(struct svc_xprt *xprt) static void svc_sock_free(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct page_frag_cache *pfc = &svsk->sk_frag_cache; struct socket *sock = svsk->sk_sock; trace_svcsock_free(svsk, sock); @@ -1618,8 +1617,7 @@ static void svc_sock_free(struct svc_xprt *xprt) sockfd_put(sock); else sock_release(sock); - if (pfc->va) - __page_frag_cache_drain(virt_to_head_page(pfc->va), - pfc->pagecnt_bias); + + page_frag_cache_drain(&svsk->sk_frag_cache); kfree(svsk); } diff --git a/tools/testing/selftests/mm/page_frag/page_frag_test.c b/tools/testing/selftests/mm/page_frag/page_frag_test.c index 13c44133e009..e806c1866e36 100644 --- a/tools/testing/selftests/mm/page_frag/page_frag_test.c +++ b/tools/testing/selftests/mm/page_frag/page_frag_test.c @@ -126,7 +126,7 @@ static int __init page_frag_test_init(void) u64 duration; int ret; - test_nc.va = NULL; + page_frag_cache_init(&test_nc); atomic_set(&nthreads, 2); init_completion(&wait); -- cgit v1.3 From d9ccb18f83ea2bb654289b6ecf014fd267cc988b Mon Sep 17 00:00:00 2001 From: Omid Ehtemam-Haghighi Date: Tue, 5 Nov 2024 17:02:36 -0800 Subject: ipv6: Fix soft lockups in fib6_select_path under high next hop churn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Soft lockups have been observed on a cluster of Linux-based edge routers located in a highly dynamic environment. Using the `bird` service, these routers continuously update BGP-advertised routes due to frequently changing nexthop destinations, while also managing significant IPv6 traffic. The lockups occur during the traversal of the multipath circular linked-list in the `fib6_select_path` function, particularly while iterating through the siblings in the list. The issue typically arises when the nodes of the linked list are unexpectedly deleted concurrently on a different core—indicated by their 'next' and 'previous' elements pointing back to the node itself and their reference count dropping to zero. This results in an infinite loop, leading to a soft lockup that triggers a system panic via the watchdog timer. Apply RCU primitives in the problematic code sections to resolve the issue. Where necessary, update the references to fib6_siblings to annotate or use the RCU APIs. Include a test script that reproduces the issue. The script periodically updates the routing table while generating a heavy load of outgoing IPv6 traffic through multiple iperf3 clients. It consistently induces infinite soft lockups within a couple of minutes. Kernel log: 0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb 1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3 2 [ffffbd13003e8e58] panic at ffffffff8cef65d4 3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03 4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f 5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756 6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af 7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d -- -- 8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb [exception RIP: fib6_select_path+299] RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287 RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000 RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618 RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200 R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830 R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c 10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c 11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5 12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47 13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0 14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274 15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474 16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615 17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec 18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3 19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9 20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice] 21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice] 22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice] 23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000 24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581 25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9 26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47 27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30 28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f 29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64 30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Adrian Oliver Signed-off-by: Omid Ehtemam-Haghighi Cc: Shuah Khan Cc: Ido Schimmel Cc: Kuniyuki Iwashima Cc: Simon Horman Reviewed-by: David Ahern Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menlosecurity.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 8 +- net/ipv6/route.c | 45 ++-- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/ipv6_route_update_soft_lockup.sh | 262 +++++++++++++++++++++ 4 files changed, 297 insertions(+), 19 deletions(-) create mode 100755 tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 6383263bfd04..c134ba202c4c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1183,8 +1183,8 @@ next_iter: while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); + list_add_tail_rcu(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, @@ -1245,7 +1245,7 @@ add: fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } @@ -1963,7 +1963,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 038c1eeef0be..63d7681c929f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -416,8 +416,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; + struct fib6_info *sibling; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -443,8 +443,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; - list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, - fib6_siblings) { + list_for_each_entry_rcu(sibling, &match->fib6_siblings, + fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; @@ -5195,14 +5195,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ + rcu_read_lock(); + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { - rt = list_first_entry(&rt_last->fib6_siblings, - struct fib6_info, - fib6_siblings); + rt = list_first_or_null_rcu(&rt_last->fib6_siblings, + struct fib6_info, + fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + + rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) @@ -5547,17 +5551,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i) nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); } else { - struct fib6_info *sibling, *next_sibling; struct fib6_nh *nh = f6i->fib6_nh; + struct fib6_info *sibling; nexthop_len = 0; if (f6i->fib6_nsiblings) { rt6_nh_nlmsg_size(nh, &nexthop_len); - list_for_each_entry_safe(sibling, next_sibling, - &f6i->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); } + + rcu_read_unlock(); } nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } @@ -5721,7 +5729,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (rt->fib6_nsiblings) { - struct fib6_info *sibling, *next_sibling; + struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); @@ -5733,14 +5741,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, 0) < 0) goto nla_put_failure; - list_for_each_entry_safe(sibling, next_sibling, - &rt->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &rt->fib6_siblings, + fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6, 0) < 0) + AF_INET6, 0) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } } + rcu_read_unlock(); + nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) @@ -6177,7 +6192,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); if (!skb) goto errout; @@ -6190,7 +6205,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + info->nlh, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 26a4883a65c9..8c4db5199a42 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -96,6 +96,7 @@ TEST_PROGS += fdb_flush.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh TEST_PROGS += bpf_offload.py +TEST_PROGS += ipv6_route_update_soft_lockup.sh # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := ncdevmem diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh new file mode 100755 index 000000000000..a6b2b1f9c641 --- /dev/null +++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh @@ -0,0 +1,262 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing for potential kernel soft lockup during IPv6 routing table +# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup +# occurs, a kernel panic will be triggered to prevent associated issues. +# +# +# Test Environment Layout +# +# ┌----------------┐ ┌----------------┐ +# | SOURCE_NS | | SINK_NS | +# | NAMESPACE | | NAMESPACE | +# |(iperf3 clients)| |(iperf3 servers)| +# | | | | +# | | | | +# | ┌-----------| nexthops |---------┐ | +# | |veth_source|<--------------------------------------->|veth_sink|<┐ | +# | └-----------|2001:0DB8:1::0:1/96 2001:0DB8:1::1:1/96 |---------┘ | | +# | | ^ 2001:0DB8:1::1:2/96 | | | +# | | . . | fwd | | +# | ┌---------┐ | . . | | | +# | | IPv6 | | . . | V | +# | | routing | | . 2001:0DB8:1::1:80/96| ┌-----┐ | +# | | table | | . | | lo | | +# | | nexthop | | . └--------┴-----┴-┘ +# | | update | | ............................> 2001:0DB8:2::1:1/128 +# | └-------- ┘ | +# └----------------┘ +# +# The test script sets up two network namespaces, source_ns and sink_ns, +# connected via a veth link. Within source_ns, it continuously updates the +# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop +# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a +# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds. +# +# Simultaneously, multiple iperf3 clients within source_ns generate heavy +# outgoing IPv6 traffic. Each client is assigned a unique port number starting +# at 5000 and incrementing sequentially. Each client targets a unique iperf3 +# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface +# using the same port number. +# +# The number of iperf3 servers and clients is set to half of the total +# available cores on each machine. +# +# NOTE: We have tested this script on machines with various CPU specifications, +# ranging from lower to higher performance as listed below. The test script +# effectively triggered a kernel soft lockup on machines running an unpatched +# kernel in under a minute: +# +# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz +# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz +# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz +# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz +# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz +# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz +# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz +# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz +# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz +# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz +# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz +# +# On less performant machines, you may need to increase the TEST_DURATION +# parameter to enhance the likelihood of encountering a race condition leading +# to a kernel soft lockup and avoid a false negative result. +# +# NOTE: The test may not produce the expected result in virtualized +# environments (e.g., qemu) due to differences in timing and CPU handling, +# which can affect the conditions needed to trigger a soft lockup. + +source lib.sh +source net_helper.sh + +TEST_DURATION=300 +ROUTING_TABLE_REFRESH_PERIOD=0.01 + +IPERF3_BITRATE="300m" + + +IPV6_NEXTHOP_ADDR_COUNT="128" +IPV6_NEXTHOP_ADDR_MASK="96" +IPV6_NEXTHOP_PREFIX="2001:0DB8:1" + + +SOURCE_TEST_IFACE="veth_source" +SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96" + +SINK_TEST_IFACE="veth_sink" +# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses: +# 2001:0DB8:1::1:1 to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT} +SINK_LOOPBACK_IFACE="lo" +SINK_LOOPBACK_IP_MASK="128" +SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1" + +nexthop_ip_list="" +termination_signal="" +kernel_softlokup_panic_prev_val="" + +terminate_ns_processes_by_pattern() { + local ns=$1 + local pattern=$2 + + for pid in $(ip netns pids ${ns}); do + [ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid + done +} + +cleanup() { + echo "info: cleaning up namespaces and terminating all processes within them..." + + + # Terminate iperf3 instances running in the source_ns. To avoid race + # conditions, first iterate over the PIDs and terminate those + # associated with the bash shells running the + # `while true; do iperf3 -c ...; done` loops. In a second iteration, + # terminate the individual `iperf3 -c ...` instances. + terminate_ns_processes_by_pattern ${source_ns} while + terminate_ns_processes_by_pattern ${source_ns} iperf3 + + # Repeat the same process for sink_ns + terminate_ns_processes_by_pattern ${sink_ns} while + terminate_ns_processes_by_pattern ${sink_ns} iperf3 + + # Check if any iperf3 instances are still running. This could happen + # if a core has entered an infinite loop and the timeout for detecting + # the soft lockup has not expired, but either the test interval has + # already elapsed or the test was terminated manually (e.g., with ^C) + for pid in $(ip netns pids ${source_ns}); do + if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then + echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!" + exit ${ksft_fail} + fi + done + + if [ "$termination_signal" == "SIGINT" ]; then + echo "SKIP: Termination due to ^C (SIGINT)" + elif [ "$termination_signal" == "SIGALRM" ]; then + echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test" + fi + + cleanup_ns ${source_ns} ${sink_ns} + + sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val} +} + +setup_prepare() { + setup_ns source_ns sink_ns + + ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns} + + # Setting up the Source namespace + ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE} + ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000 + ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up + ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1 + + # Setting up the Sink namespace + ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE} + ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up + ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1 + + ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up + ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1 + + + # Populate nexthop IPv6 addresses on the test interface in the sink_ns + echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..." + for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do + ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE}; + done + + # Preparing list of nexthops + for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do + nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1" + done +} + + +test_soft_lockup_during_routing_table_refresh() { + # Start num_of_iperf_servers iperf3 servers in the sink_ns namespace, + # each listening on ports starting at 5001 and incrementing + # sequentially. Since iperf3 instances may terminate unexpectedly, a + # while loop is used to automatically restart them in such cases. + echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..." + for i in $(seq 1 ${num_of_iperf_servers}); do + cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null" + ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null + done + + # Wait for the iperf3 servers to be ready + for i in $(seq ${num_of_iperf_servers}); do + port=$(printf '5%03d' ${i}); + wait_local_port_listen ${sink_ns} ${port} tcp + done + + # Continuously refresh the routing table in the background within + # the source_ns namespace + ip netns exec ${source_ns} bash -c " + while \$(ip netns list | grep -q ${source_ns}); do + ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list}; + sleep ${ROUTING_TABLE_REFRESH_PERIOD}; + ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK}; + done &" + + # Start num_of_iperf_servers iperf3 clients in the source_ns namespace, + # each sending TCP traffic on sequential ports starting at 5001. + # Since iperf3 instances may terminate unexpectedly (e.g., if the route + # to the server is deleted in the background during a route refresh), a + # while loop is used to automatically restart them in such cases. + echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..." + for i in $(seq 1 ${num_of_iperf_servers}); do + cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null" + ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null + done + + echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..." + echo "info: A kernel soft lockup, if detected, results in a kernel panic!" + + wait +} + +# Make sure 'iperf3' is installed, skip the test otherwise +if [ ! -x "$(command -v "iperf3")" ]; then + echo "SKIP: 'iperf3' is not installed. Skipping the test." + exit ${ksft_skip} +fi + +# Determine the number of cores on the machine +num_of_iperf_servers=$(( $(nproc)/2 )) + +# Check if we are running on a multi-core machine, skip the test otherwise +if [ "${num_of_iperf_servers}" -eq 0 ]; then + echo "SKIP: This test is not valid on a single core machine!" + exit ${ksft_skip} +fi + +# Since the kernel soft lockup we're testing causes at least one core to enter +# an infinite loop, destabilizing the host and likely affecting subsequent +# tests, we trigger a kernel panic instead of reporting a failure and +# continuing +kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic) +sysctl -qw kernel.softlockup_panic=1 + +handle_sigint() { + termination_signal="SIGINT" + cleanup + exit ${ksft_skip} +} + +handle_sigalrm() { + termination_signal="SIGALRM" + cleanup + exit ${ksft_pass} +} + +trap handle_sigint SIGINT +trap handle_sigalrm SIGALRM + +(sleep ${TEST_DURATION} && kill -s SIGALRM $$)& + +setup_prepare +test_soft_lockup_during_routing_table_refresh -- cgit v1.3 From 8b9a7bd4d6c83300e50bb1d7071c6032a07e2fed Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 Nov 2024 13:00:45 +0000 Subject: rxrpc: Add a tracepoint for aborts being proposed Add a tracepoint to rxrpc to trace the proposal of an abort. The abort is performed asynchronously by the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/726356.1730898045@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 25 +++++++++++++++++++++++++ net/rxrpc/sendmsg.c | 1 + 2 files changed, 26 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cc22596c7250..d03e0bd8c028 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -773,6 +773,31 @@ TRACE_EVENT(rxrpc_rx_done, TP_printk("r=%d a=%d", __entry->result, __entry->abort_code) ); +TRACE_EVENT(rxrpc_abort_call, + TP_PROTO(const struct rxrpc_call *call, int abort_code), + + TP_ARGS(call, abort_code), + + TP_STRUCT__entry( + __field(unsigned int, call_nr) + __field(enum rxrpc_abort_reason, why) + __field(int, abort_code) + __field(int, error) + ), + + TP_fast_assign( + __entry->call_nr = call->debug_id; + __entry->why = call->send_abort_why; + __entry->abort_code = abort_code; + __entry->error = call->send_abort_err; + ), + + TP_printk("c=%08x a=%d e=%d %s", + __entry->call_nr, + __entry->abort_code, __entry->error, + __print_symbolic(__entry->why, rxrpc_abort_reasons)) + ); + TRACE_EVENT(rxrpc_abort, TP_PROTO(unsigned int call_nr, enum rxrpc_abort_reason why, u32 cid, u32 call_id, rxrpc_seq_t seq, int abort_code, int error), diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 23d18fe5de9f..6abb8eec1b2b 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -29,6 +29,7 @@ bool rxrpc_propose_abort(struct rxrpc_call *call, s32 abort_code, int error, call->send_abort_why = why; call->send_abort_err = error; call->send_abort_seq = 0; + trace_rxrpc_abort_call(call, abort_code); /* Request abort locklessly vs rxrpc_input_call_event(). */ smp_store_release(&call->send_abort, abort_code); rxrpc_poke_call(call, rxrpc_call_poke_abort); -- cgit v1.3 From 073d89808c065ac4c672c0a613a71b27a80691cb Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 7 Nov 2024 10:34:05 +0800 Subject: net: fix data-races around sk->sk_forward_alloc Syzkaller reported this warning: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 16 at net/ipv4/af_inet.c:156 inet_sock_destruct+0x1c5/0x1e0 Modules linked in: CPU: 0 UID: 0 PID: 16 Comm: ksoftirqd/0 Not tainted 6.12.0-rc5 #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:inet_sock_destruct+0x1c5/0x1e0 Code: 24 12 4c 89 e2 5b 48 c7 c7 98 ec bb 82 41 5c e9 d1 18 17 ff 4c 89 e6 5b 48 c7 c7 d0 ec bb 82 41 5c e9 bf 18 17 ff 0f 0b eb 83 <0f> 0b eb 97 0f 0b eb 87 0f 0b e9 68 ff ff ff 66 66 2e 0f 1f 84 00 RSP: 0018:ffffc9000008bd90 EFLAGS: 00010206 RAX: 0000000000000300 RBX: ffff88810b172a90 RCX: 0000000000000007 RDX: 0000000000000002 RSI: 0000000000000300 RDI: ffff88810b172a00 RBP: ffff88810b172a00 R08: ffff888104273c00 R09: 0000000000100007 R10: 0000000000020000 R11: 0000000000000006 R12: ffff88810b172a00 R13: 0000000000000004 R14: 0000000000000000 R15: ffff888237c31f78 FS: 0000000000000000(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffc63fecac8 CR3: 000000000342e000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn+0x88/0x130 ? inet_sock_destruct+0x1c5/0x1e0 ? report_bug+0x18e/0x1a0 ? handle_bug+0x53/0x90 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? inet_sock_destruct+0x1c5/0x1e0 __sk_destruct+0x2a/0x200 rcu_do_batch+0x1aa/0x530 ? rcu_do_batch+0x13b/0x530 rcu_core+0x159/0x2f0 handle_softirqs+0xd3/0x2b0 ? __pfx_smpboot_thread_fn+0x10/0x10 run_ksoftirqd+0x25/0x30 smpboot_thread_fn+0xdd/0x1d0 kthread+0xd3/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- Its possible that two threads call tcp_v6_do_rcv()/sk_forward_alloc_add() concurrently when sk->sk_state == TCP_LISTEN with sk->sk_lock unlocked, which triggers a data-race around sk->sk_forward_alloc: tcp_v6_rcv tcp_v6_do_rcv skb_clone_and_charge_r sk_rmem_schedule __sk_mem_schedule sk_forward_alloc_add() skb_set_owner_r sk_mem_charge sk_forward_alloc_add() __kfree_skb skb_release_all skb_release_head_state sock_rfree sk_mem_uncharge sk_forward_alloc_add() sk_mem_reclaim // set local var reclaimable __sk_mem_reclaim sk_forward_alloc_add() In this syzkaller testcase, two threads call tcp_v6_do_rcv() with skb->truesize=768, the sk_forward_alloc changes like this: (cpu 1) | (cpu 2) | sk_forward_alloc ... | ... | 0 __sk_mem_schedule() | | +4096 = 4096 | __sk_mem_schedule() | +4096 = 8192 sk_mem_charge() | | -768 = 7424 | sk_mem_charge() | -768 = 6656 ... | ... | sk_mem_uncharge() | | +768 = 7424 reclaimable=7424 | | | sk_mem_uncharge() | +768 = 8192 | reclaimable=8192 | __sk_mem_reclaim() | | -4096 = 4096 | __sk_mem_reclaim() | -8192 = -4096 != 0 The skb_clone_and_charge_r() should not be called in tcp_v6_do_rcv() when sk->sk_state is TCP_LISTEN, it happens later in tcp_v6_syn_recv_sock(). Fix the same issue in dccp_v6_do_rcv(). Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241107023405.889239-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/dccp/ipv6.c | 2 +- net/ipv6/tcp_ipv6.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index da5dba120bc9..d6649246188d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -618,7 +618,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) by tcp. Feel free to propose better solution. --ANK (980728) */ - if (np->rxopt.all) + if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d71ab4e1efe1..c9de5ef8f267 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1618,7 +1618,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) by tcp. Feel free to propose better solution. --ANK (980728) */ - if (np->rxopt.all) + if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1656,8 +1656,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (reason) goto reset; } - if (opt_skb) - __kfree_skb(opt_skb); return 0; } } else -- cgit v1.3 From d977d7eb09fed1e809074a467a01473f1855657d Mon Sep 17 00:00:00 2001 From: WingMan Kwok Date: Wed, 6 Nov 2024 14:47:07 +0530 Subject: net: hsr: Add VLAN support Add support for creating VLAN interfaces over HSR/PRP interface. Signed-off-by: WingMan Kwok Signed-off-by: Murali Karicheri Signed-off-by: MD Danish Anwar Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20241106091710.3308519-2-danishanwar@ti.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 5 ----- net/hsr/hsr_forward.c | 19 ++++++++++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index ebdfd5b64e17..0ca47ebb01d3 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -572,11 +572,6 @@ void hsr_dev_setup(struct net_device *dev) NETIF_F_HW_VLAN_CTAG_TX; dev->features = dev->hw_features; - - /* VLAN on top of HSR needs testing and probably some work on - * hsr_header_create() etc. - */ - dev->features |= NETIF_F_VLAN_CHALLENGED; } /* Return true if dev is a HSR master; return false otherwise. diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index b38060246e62..aa6acebc7c1e 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -280,6 +280,7 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; + unsigned char *pc; int lsdu_size; /* pad to minimum packet size which is 60 + 6 (HSR tag) */ @@ -290,7 +291,18 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, if (frame->is_vlan) lsdu_size -= 4; - hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb); + pc = skb_mac_header(skb); + if (frame->is_vlan) + /* This 4-byte shift (size of a vlan tag) does not + * mean that the ethhdr starts there. But rather it + * provides the proper environment for accessing + * the fields, such as hsr_tag etc., just like + * when the vlan tag is not there. This is because + * the hsr tag is after the vlan tag. + */ + hsr_ethhdr = (struct hsr_ethhdr *)(pc + VLAN_HLEN); + else + hsr_ethhdr = (struct hsr_ethhdr *)pc; hsr_set_path_id(hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); @@ -368,7 +380,7 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, return skb_clone(frame->skb_std, GFP_ATOMIC); } - skb = skb_copy_expand(frame->skb_std, 0, + skb = skb_copy_expand(frame->skb_std, skb_headroom(frame->skb_std), skb_tailroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); return prp_fill_rct(skb, frame, port); @@ -690,9 +702,6 @@ static int fill_frame_info(struct hsr_frame_info *frame, if (frame->is_vlan) { vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; - /* FIXME: */ - netdev_warn_once(skb->dev, "VLAN not yet supported"); - return -EINVAL; } frame->is_from_san = false; -- cgit v1.3 From 1a8a63a5305e95519de6f941922dfcd8179f82e5 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Wed, 6 Nov 2024 14:47:08 +0530 Subject: net: hsr: Add VLAN CTAG filter support This patch adds support for VLAN ctag based filtering at slave devices. The slave ethernet device may be capable of filtering ethernet packets based on VLAN ID. This requires that when the VLAN interface is created over an HSR/PRP interface, it passes the VID information to the associated slave ethernet devices so that it updates the hardware filters to filter ethernet frames based on VID. This patch adds the required functions to propagate the vid information to the slave devices. Signed-off-by: Murali Karicheri Signed-off-by: MD Danish Anwar Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20241106091710.3308519-3-danishanwar@ti.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 0ca47ebb01d3..9e64496a5c1c 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -515,6 +515,77 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) } } +static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, + __be16 proto, u16 vid) +{ + bool is_slave_a_added = false; + bool is_slave_b_added = false; + struct hsr_port *port; + struct hsr_priv *hsr; + int ret = 0; + + hsr = netdev_priv(dev); + + hsr_for_each_port(hsr, port) { + if (port->type == HSR_PT_MASTER || + port->type == HSR_PT_INTERLINK) + continue; + + ret = vlan_vid_add(port->dev, proto, vid); + switch (port->type) { + case HSR_PT_SLAVE_A: + if (ret) { + /* clean up Slave-B */ + netdev_err(dev, "add vid failed for Slave-A\n"); + if (is_slave_b_added) + vlan_vid_del(port->dev, proto, vid); + return ret; + } + + is_slave_a_added = true; + break; + + case HSR_PT_SLAVE_B: + if (ret) { + /* clean up Slave-A */ + netdev_err(dev, "add vid failed for Slave-B\n"); + if (is_slave_a_added) + vlan_vid_del(port->dev, proto, vid); + return ret; + } + + is_slave_b_added = true; + break; + default: + break; + } + } + + return 0; +} + +static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, + __be16 proto, u16 vid) +{ + struct hsr_port *port; + struct hsr_priv *hsr; + + hsr = netdev_priv(dev); + + hsr_for_each_port(hsr, port) { + switch (port->type) { + case HSR_PT_SLAVE_A: + case HSR_PT_SLAVE_B: + vlan_vid_del(port->dev, proto, vid); + break; + default: + break; + } + } + + return 0; +} + static const struct net_device_ops hsr_device_ops = { .ndo_change_mtu = hsr_dev_change_mtu, .ndo_open = hsr_dev_open, @@ -523,6 +594,8 @@ static const struct net_device_ops hsr_device_ops = { .ndo_change_rx_flags = hsr_change_rx_flags, .ndo_fix_features = hsr_fix_features, .ndo_set_rx_mode = hsr_set_rx_mode, + .ndo_vlan_rx_add_vid = hsr_ndo_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = hsr_ndo_vlan_rx_kill_vid, }; static const struct device_type hsr_type = { @@ -569,7 +642,8 @@ void hsr_dev_setup(struct net_device *dev) dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_CTAG_TX; + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_FILTER; dev->features = dev->hw_features; } @@ -647,6 +721,10 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], (slave[1]->features & NETIF_F_HW_HSR_FWD)) hsr->fwd_offloaded = true; + if ((slave[0]->features & NETIF_F_HW_VLAN_CTAG_FILTER) && + (slave[1]->features & NETIF_F_HW_VLAN_CTAG_FILTER)) + hsr_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + res = register_netdevice(hsr_dev); if (res) goto err_unregister; -- cgit v1.3 From d5ec8d91f82ef78405b506737952dec8af95a95b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:14 -0800 Subject: rtnetlink: Remove __rtnl_link_unregister(). rtnl_link_unregister() holds RTNL and calls __rtnl_link_unregister(), where we call synchronize_srcu() to wait inflight RTM_NEWLINK requests for per-netns RTNL. We put synchronize_srcu() in __rtnl_link_unregister() due to ifb.ko and dummy.ko. However, rtnl_newlink() will acquire SRCU before RTNL later in this series. Then, lockdep will detect the deadlock: rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); To avoid the problem, we must call synchronize_srcu() before RTNL in rtnl_link_unregister(). As a preparation, let's remove __rtnl_link_unregister(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 6 +++++- drivers/net/ifb.c | 6 +++++- include/net/rtnetlink.h | 1 - net/core/rtnetlink.c | 32 ++++++++++---------------------- 4 files changed, 20 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index e9c5e1e11fa0..72618b6af44e 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -166,6 +166,7 @@ err: static int __init dummy_init_module(void) { + bool need_unregister = false; int i, err = 0; down_write(&pernet_ops_rwsem); @@ -179,12 +180,15 @@ static int __init dummy_init_module(void) cond_resched(); } if (err < 0) - __rtnl_link_unregister(&dummy_link_ops); + need_unregister = true; out: rtnl_unlock(); up_write(&pernet_ops_rwsem); + if (need_unregister) + rtnl_link_unregister(&dummy_link_ops); + return err; } diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 2c1b5def4a0b..a4b9ec4e8f30 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -424,6 +424,7 @@ err: static int __init ifb_init_module(void) { + bool need_unregister = false; int i, err; down_write(&pernet_ops_rwsem); @@ -437,12 +438,15 @@ static int __init ifb_init_module(void) cond_resched(); } if (err) - __rtnl_link_unregister(&ifb_link_ops); + need_unregister = true; out: rtnl_unlock(); up_write(&pernet_ops_rwsem); + if (need_unregister) + rtnl_link_unregister(&ifb_link_ops); + return err; } diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index b260c0cc9671..3ebfcc6e56fd 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -165,7 +165,6 @@ struct rtnl_link_ops { }; int __rtnl_link_register(struct rtnl_link_ops *ops); -void __rtnl_link_unregister(struct rtnl_link_ops *ops); int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a5c386a45501..f0246ecec7fa 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -568,27 +568,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -/** - * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. - * @ops: struct rtnl_link_ops * to unregister - * - * The caller must hold the rtnl_mutex and guarantee net_namespace_list - * integrity (hold pernet_ops_rwsem for writing to close the race - * with setup_net() and cleanup_net()). - */ -void __rtnl_link_unregister(struct rtnl_link_ops *ops) -{ - struct net *net; - - list_del_rcu(&ops->list); - synchronize_srcu(&ops->srcu); - cleanup_srcu_struct(&ops->srcu); - - for_each_net(net) - __rtnl_kill_links(net, ops); -} -EXPORT_SYMBOL_GPL(__rtnl_link_unregister); - /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ @@ -617,10 +596,19 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { + struct net *net; + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); - __rtnl_link_unregister(ops); + + list_del_rcu(&ops->list); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + for_each_net(net) + __rtnl_kill_links(net, ops); + rtnl_unlock(); up_write(&pernet_ops_rwsem); } -- cgit v1.3 From 6b57ff21a3109b1dba2d286ff415463e6fb1fca3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:15 -0800 Subject: rtnetlink: Protect link_ops by mutex. rtnl_link_unregister() holds RTNL and calls synchronize_srcu(), but rtnl_newlink() will acquire SRCU frist and then RTNL. Then, we need to unlink ops and call synchronize_srcu() outside of RTNL to avoid the deadlock. rtnl_link_unregister() rtnl_newlink() ---- ---- lock(rtnl_mutex); lock(&ops->srcu); lock(rtnl_mutex); sync(&ops->srcu); Let's move as such and add a mutex to protect link_ops. Now, link_ops is protected by its dedicated mutex and rtnl_link_register() no longer needs to hold RTNL. While at it, we move the initialisation of ops->dellink and ops->srcu out of the mutex scope. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 33 ++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 3ebfcc6e56fd..7559020f760c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -71,7 +71,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally, protected by RTNL and SRCU + * @list: Used internally, protected by link_ops_mutex and SRCU * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f0246ecec7fa..21154ef0048f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -466,6 +466,7 @@ void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); +static DEFINE_MUTEX(link_ops_mutex); static LIST_HEAD(link_ops); static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) @@ -508,14 +509,6 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) struct rtnl_link_ops *tmp; int err; - /* When RTNL is removed, add lock for link_ops. */ - ASSERT_RTNL(); - - list_for_each_entry(tmp, &link_ops, list) { - if (!strcmp(ops->kind, tmp->kind)) - return -EEXIST; - } - /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not @@ -528,9 +521,20 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if (err) return err; + mutex_lock(&link_ops_mutex); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) { + err = -EEXIST; + goto unlock; + } + } + list_add_tail_rcu(&ops->list, &link_ops); +unlock: + mutex_unlock(&link_ops_mutex); - return 0; + return err; } EXPORT_SYMBOL_GPL(__rtnl_link_register); @@ -598,14 +602,17 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - /* Close the race with setup_net() and cleanup_net() */ - down_write(&pernet_ops_rwsem); - rtnl_lock_unregistering_all(); - + mutex_lock(&link_ops_mutex); list_del_rcu(&ops->list); + mutex_unlock(&link_ops_mutex); + synchronize_srcu(&ops->srcu); cleanup_srcu_struct(&ops->srcu); + /* Close the race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); + rtnl_lock_unregistering_all(); + for_each_net(net) __rtnl_kill_links(net, ops); -- cgit v1.3 From 68297dbb967f87c3c92af9d2f652270f57c547c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:16 -0800 Subject: rtnetlink: Remove __rtnl_link_register() link_ops is protected by link_ops_mutex and no longer needs RTNL, so we have no reason to have __rtnl_link_register() separately. Let's remove it and call rtnl_link_register() from ifb.ko and dummy.ko. Note that both modules' init() work on init_net only, so we need not export pernet_ops_rwsem and can use rtnl_net_lock() there. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241108004823.29419-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/dummy.c | 17 ++++++----------- drivers/net/ifb.c | 17 ++++++----------- include/net/rtnetlink.h | 2 -- net/core/net_namespace.c | 1 - net/core/rtnetlink.c | 35 +++++++---------------------------- 5 files changed, 19 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index 72618b6af44e..005d79975f3b 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -166,27 +166,22 @@ err: static int __init dummy_init_module(void) { - bool need_unregister = false; int i, err = 0; - down_write(&pernet_ops_rwsem); - rtnl_lock(); - err = __rtnl_link_register(&dummy_link_ops); + err = rtnl_link_register(&dummy_link_ops); if (err < 0) - goto out; + return err; + + rtnl_net_lock(&init_net); for (i = 0; i < numdummies && !err; i++) { err = dummy_init_one(); cond_resched(); } - if (err < 0) - need_unregister = true; -out: - rtnl_unlock(); - up_write(&pernet_ops_rwsem); + rtnl_net_unlock(&init_net); - if (need_unregister) + if (err < 0) rtnl_link_unregister(&dummy_link_ops); return err; diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index a4b9ec4e8f30..67424888ff0a 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -424,27 +424,22 @@ err: static int __init ifb_init_module(void) { - bool need_unregister = false; int i, err; - down_write(&pernet_ops_rwsem); - rtnl_lock(); - err = __rtnl_link_register(&ifb_link_ops); + err = rtnl_link_register(&ifb_link_ops); if (err < 0) - goto out; + return err; + + rtnl_net_lock(&init_net); for (i = 0; i < numifbs && !err; i++) { err = ifb_init_one(i); cond_resched(); } - if (err) - need_unregister = true; -out: - rtnl_unlock(); - up_write(&pernet_ops_rwsem); + rtnl_net_unlock(&init_net); - if (need_unregister) + if (err) rtnl_link_unregister(&ifb_link_ops); return err; diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 7559020f760c..ef7c11f0d74c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -164,8 +164,6 @@ struct rtnl_link_ops { int *prividx, int attr); }; -int __rtnl_link_register(struct rtnl_link_ops *ops); - int rtnl_link_register(struct rtnl_link_ops *ops); void rtnl_link_unregister(struct rtnl_link_ops *ops); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 809b48c0a528..157021ced442 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -56,7 +56,6 @@ static bool init_net_initialized; * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); -EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21154ef0048f..e8357a3b9c7e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -495,20 +495,21 @@ static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) } /** - * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. + * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * - * The caller must hold the rtnl_mutex. This function should be used - * by drivers that create devices during module initialization. It - * must be called before registering the devices. - * * Returns 0 on success or a negative error code. */ -int __rtnl_link_register(struct rtnl_link_ops *ops) +int rtnl_link_register(struct rtnl_link_ops *ops) { struct rtnl_link_ops *tmp; int err; + /* Sanity-check max sizes to avoid stack buffer overflow. */ + if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || + ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) + return -EINVAL; + /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not @@ -536,28 +537,6 @@ unlock: return err; } -EXPORT_SYMBOL_GPL(__rtnl_link_register); - -/** - * rtnl_link_register - Register rtnl_link_ops with rtnetlink. - * @ops: struct rtnl_link_ops * to register - * - * Returns 0 on success or a negative error code. - */ -int rtnl_link_register(struct rtnl_link_ops *ops) -{ - int err; - - /* Sanity-check max sizes to avoid stack buffer overflow. */ - if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || - ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) - return -EINVAL; - - rtnl_lock(); - err = __rtnl_link_register(ops); - rtnl_unlock(); - return err; -} EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -- cgit v1.3 From cbaaa6326bc58e75239df437a8fdcdb2335d3b24 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:17 -0800 Subject: rtnetlink: Introduce struct rtnl_nets and helpers. rtnl_newlink() needs to hold 3 per-netns RTNL: 2 for a new device and 1 for its peer. We will add rtnl_nets_lock() later, which performs the nested locking based on struct rtnl_nets, which has an array of struct net pointers. rtnl_nets_add() adds a net pointer to the array and sorts it so that rtnl_nets_lock() can simply acquire per-netns RTNL from array[0] to [2]. Before calling rtnl_nets_add(), get_net() must be called for the net, and rtnl_nets_destroy() will call put_net() for each. Let's apply the helpers to rtnl_newlink(). When CONFIG_DEBUG_NET_SMALL_RTNL is disabled, we do not call rtnl_net_lock() thus do not care about the array order, so rtnl_net_cmp_locks() returns -1 so that the loop in rtnl_nets_add() can be optimised to NOP. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e8357a3b9c7e..960d9d2c6aec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -258,8 +258,67 @@ bool lockdep_rtnl_net_is_held(struct net *net) return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_net_is_held); +#else +static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) +{ + /* No need to swap */ + return -1; +} #endif +struct rtnl_nets { + /* ->newlink() needs to freeze 3 netns at most; + * 2 for the new device, 1 for its peer. + */ + struct net *net[3]; + unsigned char len; +}; + +static void rtnl_nets_init(struct rtnl_nets *rtnl_nets) +{ + memset(rtnl_nets, 0, sizeof(*rtnl_nets)); +} + +static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) { + put_net(rtnl_nets->net[i]); + rtnl_nets->net[i] = NULL; + } + + rtnl_nets->len = 0; +} + +/** + * rtnl_nets_add - Add netns to be locked before ->newlink(). + * + * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net(). + * @net: netns pointer with an extra refcnt held. + * + * The extra refcnt is released in rtnl_nets_destroy(). + */ +static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) +{ + int i; + + DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net)); + + for (i = 0; i < rtnl_nets->len; i++) { + switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) { + case 0: + put_net(net); + return; + case 1: + swap(rtnl_nets->net[i], net); + } + } + + rtnl_nets->net[i] = net; + rtnl_nets->len++; +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -3767,6 +3826,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *tgt_net, *link_net = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + struct rtnl_nets rtnl_nets; int ops_srcu_index; int ret; @@ -3810,6 +3870,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, #endif } + rtnl_nets_init(&rtnl_nets); + if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) { ret = -EINVAL; @@ -3839,6 +3901,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_ops; } + rtnl_nets_add(&rtnl_nets, tgt_net); + if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); @@ -3849,6 +3913,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_net; } + rtnl_nets_add(&rtnl_nets, link_net); + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto put_net; @@ -3858,9 +3924,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); put_net: - if (link_net) - put_net(link_net); - put_net(tgt_net); + rtnl_nets_destroy(&rtnl_nets); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); -- cgit v1.3 From 28690e5361c05fd4ef0ca3a17d1c667cba790554 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:18 -0800 Subject: rtnetlink: Add peer_type in struct rtnl_link_ops. In ops->newlink(), veth, vxcan, and netkit call rtnl_link_get_net() with a net pointer, which is the first argument of ->newlink(). rtnl_link_get_net() could return another netns based on IFLA_NET_NS_PID and IFLA_NET_NS_FD in the peer device's attributes. We want to get it and fill rtnl_nets->nets[] in advance in rtnl_newlink() for per-netns RTNL. All of the three get the peer netns in the same way: 1. Call rtnl_nla_parse_ifinfomsg() 2. Call ops->validate() (vxcan doesn't have) 3. Call rtnl_link_get_net_tb() Let's add a new field peer_type to struct rtnl_link_ops and prefetch netns in the peer ifla to add it to rtnl_nets in rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ef7c11f0d74c..bef76abcff8d 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -75,6 +75,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit + * @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER) * @maxtype: Highest device specific netlink attribute number * @policy: Netlink policy for device specific attribute validation * @validate: Optional validation function for netlink/changelink parameters @@ -116,6 +117,7 @@ struct rtnl_link_ops { void (*setup)(struct net_device *dev); bool netns_refund; + const u16 peer_type; unsigned int maxtype; const struct nla_policy *policy; int (*validate)(struct nlattr *tb[], diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 960d9d2c6aec..1af187a4a3f1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2492,9 +2492,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); -struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[]) { - struct net *net; + struct net *net = NULL; + /* Examine the link attributes and figure out which * network namespace we are talking about. */ @@ -2502,8 +2503,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); - else + + return net; +} + +struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) +{ + struct net *net = rtnl_link_get_net_ifla(tb); + + if (!net) net = get_net(src_net); + return net; } EXPORT_SYMBOL(rtnl_link_get_net); @@ -3765,6 +3775,37 @@ out_unregister: goto out; } +static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets, + const struct rtnl_link_ops *ops, + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net *net; + int err; + + if (!data || !data[ops->peer_type]) + return 0; + + err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); + if (err < 0) + return err; + + if (ops->validate) { + err = ops->validate(tb, NULL, extack); + if (err < 0) + return err; + } + + net = rtnl_link_get_net_ifla(tb); + if (IS_ERR(net)) + return PTR_ERR(net); + if (net) + rtnl_nets_add(rtnl_nets, net); + + return 0; +} + static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, @@ -3893,12 +3934,18 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) goto put_ops; } + + if (ops->peer_type) { + ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack); + if (ret < 0) + goto put_ops; + } } tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); if (IS_ERR(tgt_net)) { ret = PTR_ERR(tgt_net); - goto put_ops; + goto put_net; } rtnl_nets_add(&rtnl_nets, tgt_net); -- cgit v1.3 From d91191ffe23f927b14b8e861f22037cf153c48cb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:22 -0800 Subject: rtnetlink: Convert RTM_NEWLINK to per-netns RTNL. Now, we are ready to convert rtnl_newlink() to per-netns RTNL; rtnl_link_ops is protected by SRCU and netns is prefetched in rtnl_newlink(). Let's register rtnl_newlink() with RTNL_FLAG_DOIT_PERNET and push RTNL down as rtnl_nets_lock(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241108004823.29419-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1af187a4a3f1..30191d17add3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -319,6 +319,26 @@ static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net) rtnl_nets->len++; } +static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets) +{ + int i; + + rtnl_lock(); + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_lock(rtnl_nets->net[i]); +} + +static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets) +{ + int i; + + for (i = 0; i < rtnl_nets->len; i++) + __rtnl_net_unlock(rtnl_nets->net[i]); + + rtnl_unlock(); +} + static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) @@ -3903,9 +3923,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { - __rtnl_unlock(); request_module("rtnl-link-%s", kind); - rtnl_lock(); ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif @@ -3968,7 +3986,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } + rtnl_nets_lock(&rtnl_nets); ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); + rtnl_nets_unlock(&rtnl_nets); put_net: rtnl_nets_destroy(&rtnl_nets); @@ -6972,7 +6992,8 @@ static struct pernet_operations rtnetlink_net_ops = { }; static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { - {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink}, + {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, + .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_DELLINK, .doit = rtnl_dellink}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, -- cgit v1.3 From 636af13f213bf9b28a34254327934bc72a797754 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 7 Nov 2024 16:48:23 -0800 Subject: rtnetlink: Register rtnl_dellink() and rtnl_setlink() with RTNL_FLAG_DOIT_PERNET_WIP. Currently, rtnl_setlink() and rtnl_dellink() cannot be fully converted to per-netns RTNL due to a lack of handling peer/lower/upper devices in different netns. For example, when we change a device in rtnl_setlink() and need to propagate that to its upper devices, we want to avoid acquiring all netns locks, for which we do not know the upper limit. The same situation happens when we remove a device. rtnl_dellink() could be transformed to remove a single device in the requested netns and delegate other devices to per-netns work, and rtnl_setlink() might be ? Until we come up with a better idea, let's use a new flag RTNL_FLAG_DOIT_PERNET_WIP for rtnl_dellink() and rtnl_setlink(). This will unblock converting RTNL users where such devices are not related. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Nikolay Aleksandrov Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241108004823.29419-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 1 + net/core/rtnetlink.c | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bef76abcff8d..bc0069a8b6ea 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,6 +13,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), #define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED +#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 30191d17add3..327fa4957929 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3379,6 +3379,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; + struct rtnl_nets rtnl_nets; struct net *tgt_net; int err; @@ -3397,6 +3398,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + rtnl_nets_init(&rtnl_nets); + rtnl_nets_add(&rtnl_nets, get_net(net)); + rtnl_nets_add(&rtnl_nets, tgt_net); + + rtnl_nets_lock(&rtnl_nets); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3409,7 +3416,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, else if (!err) err = -ENODEV; - put_net(tgt_net); + rtnl_nets_unlock(&rtnl_nets); errout: return err; } @@ -3494,6 +3501,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } + rtnl_net_lock(tgt_net); + if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3508,6 +3517,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, else err = -EINVAL; + rtnl_net_unlock(tgt_net); + if (netnsid >= 0) put_net(tgt_net); @@ -6994,10 +7005,12 @@ static struct pernet_operations rtnetlink_net_ops = { static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink, .flags = RTNL_FLAG_DOIT_PERNET}, - {.msgtype = RTM_DELLINK, .doit = rtnl_dellink}, + {.msgtype = RTM_DELLINK, .doit = rtnl_dellink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETLINK, .doit = rtnl_getlink, .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, - {.msgtype = RTM_SETLINK, .doit = rtnl_setlink}, + {.msgtype = RTM_SETLINK, .doit = rtnl_setlink, + .flags = RTNL_FLAG_DOIT_PERNET_WIP}, {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all}, {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all}, -- cgit v1.3 From f2685c00c3222305f5b6740a8b16ea044640283a Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 7 Nov 2024 21:03:30 +0000 Subject: net: fix SO_DEVMEM_DONTNEED looping too long Exit early if we're freeing more than 1024 frags, to prevent looping too long. Also minor code cleanups: - Flip checks to reduce indentation. - Use sizeof(*tokens) everywhere for consistentcy. Cc: Yi Lai Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241107210331.3044434-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 039be95c40cf..da50df485090 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) #ifdef CONFIG_PAGE_POOL -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in - * 1 syscall. The limit exists to limit the amount of memory the kernel - * allocates to copy these tokens. +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. */ #define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 static noinline_for_stack int sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) { unsigned int num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; netmem_ref netmems[16]; - int ret = 0; if (!sk_is_tcp(sk)) return -EBADF; - if (optlen % sizeof(struct dmabuf_token) || + if (optlen % sizeof(*tokens) || optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) return -EINVAL; - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); if (!tokens) return -ENOMEM; - num_tokens = optlen / sizeof(struct dmabuf_token); if (copy_from_sockptr(tokens, optval, optlen)) { kvfree(tokens); return -EFAULT; @@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + netmem_ref netmem = (__force netmem_ref)__xa_erase( &sk->sk_user_frags, tokens[i].token_start + j); - if (netmem && - !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) { - netmems[netmem_num++] = netmem; - if (netmem_num == ARRAY_SIZE(netmems)) { - xa_unlock_bh(&sk->sk_user_frags); - for (k = 0; k < netmem_num; k++) - WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); - netmem_num = 0; - xa_lock_bh(&sk->sk_user_frags); - } - ret++; + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); } + ret++; } } +frag_limit_reached: xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); -- cgit v1.3 From 5dc51ec86df6e2214d8398079c1e31736593ab53 Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:31 +0000 Subject: net: Add napi_struct parameter irq_suspend_timeout Add a per-NAPI IRQ suspension parameter, which can be get/set with netdev-genl. This patch doesn't change any behavior but prepares the code for other changes in the following commits which use irq_suspend_timeout as a timeout for IRQ suspension. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/netdev.yaml | 7 +++++++ include/linux/netdevice.h | 2 ++ include/uapi/linux/netdev.h | 1 + net/core/dev.c | 2 ++ net/core/dev.h | 25 +++++++++++++++++++++++++ net/core/netdev-genl-gen.c | 5 +++-- net/core/netdev-genl.c | 12 ++++++++++++ tools/include/uapi/linux/netdev.h | 1 + 8 files changed, 53 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f9cb97d6106c..cbb544bd6c84 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -263,6 +263,11 @@ attribute-sets: the end of a NAPI cycle. This may add receive latency in exchange for reducing the number of frames processed by the network stack. type: uint + - + name: irq-suspend-timeout + doc: The timeout, in nanoseconds, of how long to suspend irq + processing, if event polling finds events + type: uint - name: queue attributes: @@ -653,6 +658,7 @@ operations: - pid - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout dump: request: attributes: @@ -704,6 +710,7 @@ operations: - id - defer-hard-irqs - gro-flush-timeout + - irq-suspend-timeout kernel-family: headers: [ "linux/list.h"] diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index df4483598628..0aae346d919e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -348,6 +348,7 @@ struct gro_list { */ struct napi_config { u64 gro_flush_timeout; + u64 irq_suspend_timeout; u32 defer_hard_irqs; unsigned int napi_id; }; @@ -384,6 +385,7 @@ struct napi_struct { struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; + unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ struct list_head dev_list; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) diff --git a/net/core/dev.c b/net/core/dev.c index 6a31152e4606..4d910872963f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6666,6 +6666,7 @@ static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; + n->irq_suspend_timeout = n->config->irq_suspend_timeout; /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. It will be saved to the config * in napi_disable. @@ -6680,6 +6681,7 @@ static void napi_save_config(struct napi_struct *n) { n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; + n->config->irq_suspend_timeout = n->irq_suspend_timeout; n->config->napi_id = n->napi_id; napi_hash_del(n); } diff --git a/net/core/dev.h b/net/core/dev.h index 7881bced70a9..d043dee25a68 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -236,6 +236,31 @@ static inline void netdev_set_gro_flush_timeout(struct net_device *netdev, netdev->napi_config[i].gro_flush_timeout = timeout; } +/** + * napi_get_irq_suspend_timeout - get the irq_suspend_timeout + * @n: napi struct to get the irq_suspend_timeout from + * + * Return: the per-NAPI value of the irq_suspend_timeout field. + */ +static inline unsigned long +napi_get_irq_suspend_timeout(const struct napi_struct *n) +{ + return READ_ONCE(n->irq_suspend_timeout); +} + +/** + * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi + * @n: napi struct to set the irq_suspend_timeout + * @timeout: timeout value to set + * + * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout + */ +static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, + unsigned long timeout) +{ + WRITE_ONCE(n->irq_suspend_timeout, timeout); +} + int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 21de7e10be16..a89cbd8d87c3 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -92,10 +92,11 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] }; /* NETDEV_CMD_NAPI_SET - do */ -static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT + 1] = { +static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT + 1] = { [NETDEV_A_NAPI_ID] = { .type = NLA_U32, }, [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, + [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; /* Ops table for netdev */ @@ -186,7 +187,7 @@ static const struct genl_split_ops netdev_nl_ops[] = { .cmd = NETDEV_CMD_NAPI_SET, .doit = netdev_nl_napi_set_doit, .policy = netdev_napi_set_nl_policy, - .maxattr = NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, }; diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b49c3b4e5fbe..765ce7c9d73b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -161,6 +161,7 @@ static int netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, const struct genl_info *info) { + unsigned long irq_suspend_timeout; unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; void *hdr; @@ -196,6 +197,11 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, napi_defer_hard_irqs)) goto nla_put_failure; + irq_suspend_timeout = napi_get_irq_suspend_timeout(napi); + if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + irq_suspend_timeout)) + goto nla_put_failure; + gro_flush_timeout = napi_get_gro_flush_timeout(napi); if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, gro_flush_timeout)) @@ -306,6 +312,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) { + u64 irq_suspend_timeout = 0; u64 gro_flush_timeout = 0; u32 defer = 0; @@ -314,6 +321,11 @@ netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info) napi_set_defer_hard_irqs(napi, defer); } + if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) { + irq_suspend_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]); + napi_set_irq_suspend_timeout(napi, irq_suspend_timeout); + } + if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) { gro_flush_timeout = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]); napi_set_gro_flush_timeout(napi, gro_flush_timeout); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e3ebb49f60d2..e4be227d3ad6 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -124,6 +124,7 @@ enum { NETDEV_A_NAPI_PID, NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, + NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.3 From 3fcbecbdeb048dfd1bea824f4276717fed02d10e Mon Sep 17 00:00:00 2001 From: Martin Karsten Date: Sat, 9 Nov 2024 05:02:32 +0000 Subject: net: Add control functions for irq suspension The napi_suspend_irqs routine bootstraps irq suspension by elongating the defer timeout to irq_suspend_timeout. The napi_resume_irqs routine effectively cancels irq suspension by forcing the napi to be scheduled immediately. Signed-off-by: Martin Karsten Co-developed-by: Joe Damato Signed-off-by: Joe Damato Tested-by: Joe Damato Tested-by: Martin Karsten Acked-by: Stanislav Fomichev Reviewed-by: Sridhar Samudrala Link: https://patch.msgid.link/20241109050245.191288-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 3 +++ net/core/dev.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index f03040baaefd..c858270141bc 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -52,6 +52,9 @@ void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); +void napi_suspend_irqs(unsigned int napi_id); +void napi_resume_irqs(unsigned int napi_id); + #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { diff --git a/net/core/dev.c b/net/core/dev.c index 4d910872963f..13d00fc10f55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6507,6 +6507,43 @@ void napi_busy_loop(unsigned int napi_id, } EXPORT_SYMBOL(napi_busy_loop); +void napi_suspend_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + unsigned long timeout = napi_get_irq_suspend_timeout(napi); + + if (timeout) + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + } + rcu_read_unlock(); +} + +void napi_resume_irqs(unsigned int napi_id) +{ + struct napi_struct *napi; + + rcu_read_lock(); + napi = napi_by_id(napi_id); + if (napi) { + /* If irq_suspend_timeout is set to 0 between the call to + * napi_suspend_irqs and now, the original value still + * determines the safety timeout as intended and napi_watchdog + * will resume irq processing. + */ + if (napi_get_irq_suspend_timeout(napi)) { + local_bh_disable(); + napi_schedule(napi); + local_bh_enable(); + } + } + rcu_read_unlock(); +} + #endif /* CONFIG_NET_RX_BUSY_POLL */ static void __napi_hash_add_with_id(struct napi_struct *napi, -- cgit v1.3 From 581302298524e9d77c4c44ff5156a6cd112227ae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Nov 2024 11:58:16 +0100 Subject: mptcp: error out earlier on disconnect Eric reported a division by zero splat in the MPTCP protocol: Oops: divide error: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 6094 Comm: syz-executor317 Not tainted 6.12.0-rc5-syzkaller-00291-g05b92660cdfe #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__tcp_select_window+0x5b4/0x1310 net/ipv4/tcp_output.c:3163 Code: f6 44 01 e3 89 df e8 9b 75 09 f8 44 39 f3 0f 8d 11 ff ff ff e8 0d 74 09 f8 45 89 f4 e9 04 ff ff ff e8 00 74 09 f8 44 89 f0 99 7c 24 14 41 29 d6 45 89 f4 e9 ec fe ff ff e8 e8 73 09 f8 48 89 RSP: 0018:ffffc900041f7930 EFLAGS: 00010293 RAX: 0000000000017e67 RBX: 0000000000017e67 RCX: ffffffff8983314b RDX: 0000000000000000 RSI: ffffffff898331b0 RDI: 0000000000000004 RBP: 00000000005d6000 R08: 0000000000000004 R09: 0000000000017e67 R10: 0000000000003e80 R11: 0000000000000000 R12: 0000000000003e80 R13: ffff888031d9b440 R14: 0000000000017e67 R15: 00000000002eb000 FS: 00007feb5d7f16c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007feb5d8adbb8 CR3: 0000000074e4c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __tcp_cleanup_rbuf+0x3e7/0x4b0 net/ipv4/tcp.c:1493 mptcp_rcv_space_adjust net/mptcp/protocol.c:2085 [inline] mptcp_recvmsg+0x2156/0x2600 net/mptcp/protocol.c:2289 inet_recvmsg+0x469/0x6a0 net/ipv4/af_inet.c:885 sock_recvmsg_nosec net/socket.c:1051 [inline] sock_recvmsg+0x1b2/0x250 net/socket.c:1073 __sys_recvfrom+0x1a5/0x2e0 net/socket.c:2265 __do_sys_recvfrom net/socket.c:2283 [inline] __se_sys_recvfrom net/socket.c:2279 [inline] __x64_sys_recvfrom+0xe0/0x1c0 net/socket.c:2279 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7feb5d857559 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007feb5d7f1208 EFLAGS: 00000246 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00007feb5d8e1318 RCX: 00007feb5d857559 RDX: 000000800000000e RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007feb5d8e1310 R08: 0000000000000000 R09: ffffffff81000000 R10: 0000000000000100 R11: 0000000000000246 R12: 00007feb5d8e131c R13: 00007feb5d8ae074 R14: 000000800000000e R15: 00000000fffffdef and provided a nice reproducer. The root cause is the current bad handling of racing disconnect. After the blamed commit below, sk_wait_data() can return (with error) with the underlying socket disconnected and a zero rcv_mss. Catch the error and return without performing any additional operations on the current socket. Reported-by: Eric Dumazet Fixes: 419ce133ab92 ("tcp: allow again tcp_disconnect() when threads are waiting") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/8c82ecf71662ecbc47bf390f9905de70884c9f2d.1731060874.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d263091659e0..95a5a3da3944 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2205,7 +2205,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { - int bytes_read; + int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { @@ -2267,9 +2267,16 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - sk_wait_data(sk, &timeo, NULL); + mptcp_rcv_space_adjust(msk, copied); + err = sk_wait_data(sk, &timeo, NULL); + if (err < 0) { + err = copied ? : err; + goto out_err; + } } + mptcp_rcv_space_adjust(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) @@ -2285,8 +2292,6 @@ out_err: pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); - if (!(flags & MSG_PEEK)) - mptcp_rcv_space_adjust(msk, copied); release_sock(sk); return copied; -- cgit v1.3 From ce7356ae35943cc6494cc692e62d51a734062b7d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Nov 2024 11:58:17 +0100 Subject: mptcp: cope racing subflow creation in mptcp_rcv_space_adjust Additional active subflows - i.e. created by the in kernel path manager - are included into the subflow list before starting the 3whs. A racing recvmsg() spooling data received on an already established subflow would unconditionally call tcp_cleanup_rbuf() on all the current subflows, potentially hitting a divide by zero error on the newly created ones. Explicitly check that the subflow is in a suitable state before invoking tcp_cleanup_rbuf(). Fixes: c76c6956566f ("mptcp: call tcp_cleanup_rbuf on subflows") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/02374660836e1b52afc91966b7535c8c5f7bafb0.1731060874.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 95a5a3da3944..48d480982b78 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2082,7 +2082,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); - tcp_cleanup_rbuf(ssk, 1); + if (tcp_can_send_ack(ssk)) + tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } -- cgit v1.3 From 7d3f3b4367f315a61fc615e3138f3d320da8c466 Mon Sep 17 00:00:00 2001 From: Vladimir Vdovin Date: Fri, 8 Nov 2024 09:34:24 +0000 Subject: net: ipv4: Cache pmtu for all packet paths if multipath enabled Check number of paths by fib_info_num_path(), and update_or_create_fnhe() for every path. Problem is that pmtu is cached only for the oif that has received icmp message "need to frag", other oifs will still try to use "default" iface mtu. An example topology showing the problem: | host1 +---------+ | dummy0 | 10.179.20.18/32 mtu9000 +---------+ +-----------+----------------+ +---------+ +---------+ | ens17f0 | 10.179.2.141/31 | ens17f1 | 10.179.2.13/31 +---------+ +---------+ | (all here have mtu 9000) | +------+ +------+ | ro1 | 10.179.2.140/31 | ro2 | 10.179.2.12/31 +------+ +------+ | | ---------+------------+-------------------+------ | +-----+ | ro3 | 10.10.10.10 mtu1500 +-----+ | ======================================== some networks ======================================== | +-----+ | eth0| 10.10.30.30 mtu9000 +-----+ | host2 host1 have enabled multipath and sysctl net.ipv4.fib_multipath_hash_policy = 1: default proto static src 10.179.20.18 nexthop via 10.179.2.12 dev ens17f1 weight 1 nexthop via 10.179.2.140 dev ens17f0 weight 1 When host1 tries to do pmtud from 10.179.20.18/32 to host2, host1 receives at ens17f1 iface an icmp packet from ro3 that ro3 mtu=1500. And host1 caches it in nexthop exceptions cache. Problem is that it is cached only for the iface that has received icmp, and there is no way that ro3 will send icmp msg to host1 via another path. Host1 now have this routes to host2: ip r g 10.10.30.30 sport 30000 dport 443 10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0 cache expires 521sec mtu 1500 ip r g 10.10.30.30 sport 30033 dport 443 10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0 cache So when host1 tries again to reach host2 with mtu>1500, if packet flow is lucky enough to be hashed with oif=ens17f1 its ok, if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 to ens17f1, until lucky day when ro3 will send it through another flow to ens17f0. Signed-off-by: Vladimir Vdovin Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241108093427.317942-1-deliran@verdict.gg Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 13 +++++ tools/testing/selftests/net/pmtu.sh | 112 ++++++++++++++++++++++++++++++------ 2 files changed, 108 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4c5e773002fe..ccdbe9c70132 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1027,6 +1027,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fib_info_num_path(res.fi) > 1) { + int nhsel; + + for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { + nhc = fib_info_nhc(res.fi, nhsel); + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, + jiffies + net->ipv4.ip_rt_mtu_expires); + } + rcu_read_unlock(); + return; + } +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 6c651c880fe8..66be7699c72c 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -197,6 +197,12 @@ # # - pmtu_ipv6_route_change # Same as above but with IPv6 +# +# - pmtu_ipv4_mp_exceptions +# Use the same topology as in pmtu_ipv4, but add routeable addresses +# on host A and B on lo reachable via both routers. Host A and B +# addresses have multipath routes to each other, b_r1 mtu = 1500. +# Check that PMTU exceptions are created for both paths. source lib.sh source net_helper.sh @@ -266,7 +272,8 @@ tests=" list_flush_ipv4_exception ipv4: list and flush cached exceptions 1 list_flush_ipv6_exception ipv6: list and flush cached exceptions 1 pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1 - pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1" + pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1 + pmtu_ipv4_mp_exceptions ipv4: PMTU multipath nh exceptions 1" # Addressing and routing for tests with routers: four network segments, with # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an @@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a" tunnel6_b_addr="fd00:2::b" tunnel6_mask="64" +host4_a_addr="192.168.99.99" +host4_b_addr="192.168.88.88" + dummy6_0_prefix="fc00:1000::" dummy6_1_prefix="fc00:1001::" dummy6_mask="64" @@ -984,6 +994,52 @@ setup_ovs_bridge() { run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2 } +setup_multipath_new() { + # Set up host A with multipath routes to host B host4_b_addr + run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo + run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1 + run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2 + run_cmd ${ns_a} ip nexthop add id 403 group 401/402 + run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403 + + # Set up host B with multipath routes to host A host4_a_addr + run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo + run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1 + run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2 + run_cmd ${ns_b} ip nexthop add id 403 group 401/402 + run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403 +} + +setup_multipath_old() { + # Set up host A with multipath routes to host B host4_b_addr + run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo + run_cmd ${ns_a} ip route add ${host4_b_addr} \ + src ${host4_a_addr} \ + nexthop via ${prefix4}.${a_r1}.2 weight 1 \ + nexthop via ${prefix4}.${a_r2}.2 weight 1 + + # Set up host B with multipath routes to host A host4_a_addr + run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo + run_cmd ${ns_b} ip route add ${host4_a_addr} \ + src ${host4_b_addr} \ + nexthop via ${prefix4}.${b_r1}.2 weight 1 \ + nexthop via ${prefix4}.${b_r2}.2 weight 1 +} + +setup_multipath() { + if [ "$USE_NH" = "yes" ]; then + setup_multipath_new + else + setup_multipath_old + fi + + # Set up routers with routes to dummies + run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1 + run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1 + run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1 + run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1 +} + setup() { [ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip @@ -1076,23 +1132,15 @@ link_get_mtu() { } route_get_dst_exception() { - ns_cmd="${1}" - dst="${2}" - dsfield="${3}" + ns_cmd="${1}"; shift - if [ -z "${dsfield}" ]; then - dsfield=0 - fi - - ${ns_cmd} ip route get "${dst}" dsfield "${dsfield}" + ${ns_cmd} ip route get "$@" } route_get_dst_pmtu_from_exception() { - ns_cmd="${1}" - dst="${2}" - dsfield="${3}" + ns_cmd="${1}"; shift - mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")" + mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")" } check_pmtu_value() { @@ -1235,10 +1283,10 @@ test_pmtu_ipv4_dscp_icmp_exception() { run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}" # Check that exceptions have been created with the correct PMTU - pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")" check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 - pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")" check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 } @@ -1285,9 +1333,9 @@ test_pmtu_ipv4_dscp_udp_exception() { UDP:"${dst2}":50000,tos="${dsfield}" # Check that exceptions have been created with the correct PMTU - pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")" + pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")" check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1 - pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")" + pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")" check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1 } @@ -2329,6 +2377,36 @@ test_pmtu_ipv6_route_change() { test_pmtu_ipvX_route_change 6 } +test_pmtu_ipv4_mp_exceptions() { + setup namespaces routing multipath || return $ksft_skip + + trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \ + "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \ + "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2 + + # Set up initial MTU values + mtu "${ns_a}" veth_A-R1 2000 + mtu "${ns_r1}" veth_R1-A 2000 + mtu "${ns_r1}" veth_R1-B 1500 + mtu "${ns_b}" veth_B-R1 1500 + + mtu "${ns_a}" veth_A-R2 2000 + mtu "${ns_r2}" veth_R2-A 2000 + mtu "${ns_r2}" veth_R2-B 1500 + mtu "${ns_b}" veth_B-R2 1500 + + # Ping and expect two nexthop exceptions for two routes + run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}" + + # Check that exceptions have been created with the correct PMTU + pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)" + pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)" + + check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1 + check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1 +} + usage() { echo echo "$0 [OPTIONS] [TEST]..." -- cgit v1.3 From 37653a0b8a6f5d6ab23daa8e585c5ed24a0fc500 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:53 +0800 Subject: net: ip: make fib_validate_source() support drop reasons In this commit, we make fib_validate_source() and __fib_validate_source() return -reason instead of errno on error. The return value of fib_validate_source can be -errno, 0, and 1. It's hard to make fib_validate_source() return drop reasons directly. The fib_validate_source() will return 1 if the scope of the source(revert) route is HOST. And the __mkroute_input() will mark the skb with IPSKB_DOREDIRECT in this case (combine with some other conditions). And then, a REDIRECT ICMP will be sent in ip_forward() if this flag exists. We can't pass this information to __mkroute_input if we make fib_validate_source() return drop reasons. Therefore, we introduce the wrapper fib_validate_source_reason() for fib_validate_source(), which will return the drop reasons on error. In the origin logic, LINUX_MIB_IPRPFILTER will be counted if fib_validate_source() return -EXDEV. And now, we need to adjust it by checking "reason == SKB_DROP_REASON_IP_RPFILTER". However, this will take effect only after the patch "net: ip: make ip_route_input_noref() return drop reasons", as we can't pass the drop reasons from fib_validate_source() to ip_rcv_finish_core() in this patch. Following new drop reasons are added in this patch: SKB_DROP_REASON_IP_LOCAL_SOURCE SKB_DROP_REASON_IP_INVALID_SOURCE Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 10 ++++++++++ include/net/ip_fib.h | 12 ++++++++++++ net/ipv4/fib_frontend.c | 17 ++++++++++++----- net/ipv4/ip_input.c | 4 +--- net/ipv4/route.c | 33 +++++++++++++++++++-------------- 5 files changed, 54 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d59bb96c5a02..62a60be1db84 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -76,6 +76,8 @@ FN(INVALID_PROTO) \ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ + FN(IP_LOCAL_SOURCE) \ + FN(IP_INVALID_SOURCE) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -373,6 +375,14 @@ enum skb_drop_reason { * IPSTATS_MIB_INADDRERRORS */ SKB_DROP_REASON_IP_INNOROUTES, + /** @SKB_DROP_REASON_IP_LOCAL_SOURCE: the source ip is local */ + SKB_DROP_REASON_IP_LOCAL_SOURCE, + /** + * @SKB_DROP_REASON_IP_INVALID_SOURCE: the source ip is invalid: + * 1) source ip is multicast or limited broadcast + * 2) source ip is zero and not IGMP + */ + SKB_DROP_REASON_IP_INVALID_SOURCE, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index b6e44f4eaa4c..a113c11ab56b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -452,6 +452,18 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dscp_t dscp, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); +static inline enum skb_drop_reason +fib_validate_source_reason(struct sk_buff *skb, __be32 src, __be32 dst, + dscp_t dscp, int oif, struct net_device *dev, + struct in_device *idev, u32 *itag) +{ + int err = fib_validate_source(skb, src, dst, dscp, oif, dev, idev, + itag); + if (err < 0) + return -err; + return SKB_NOT_DROPPED_YET; +} + #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0c9ce934b490..87bb36a5bdec 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -346,6 +346,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, int rpf, struct in_device *idev, u32 *itag) { struct net *net = dev_net(dev); + enum skb_drop_reason reason; struct flow_keys flkeys; int ret, no_addr; struct fib_result res; @@ -377,9 +378,15 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; - if (res.type != RTN_UNICAST && - (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) - goto e_inval; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; + goto e_inval; + } else if (!IN_DEV_ACCEPT_LOCAL(idev)) { + reason = SKB_DROP_REASON_IP_LOCAL_SOURCE; + goto e_inval; + } + } fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); @@ -412,9 +419,9 @@ last_resort: return 0; e_inval: - return -EINVAL; + return -reason; e_rpf: - return -EXDEV; + return -SKB_DROP_REASON_IP_RPFILTER; } /* Ignore rp_filter for packets protected by IPsec. */ diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 89bb63da6852..c40a26972884 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -425,10 +425,8 @@ drop: return NET_RX_DROP; drop_error: - if (err == -EXDEV) { - drop_reason = SKB_DROP_REASON_IP_RPFILTER; + if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - } goto drop; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ccdbe9c70132..c38e95d9da9e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1682,7 +1682,7 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { - int err; + enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) @@ -1700,10 +1700,10 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol != IPPROTO_IGMP) return -EINVAL; } else { - err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, - itag); - if (err < 0) - return err; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, itag); + if (reason) + return -EINVAL; } return 0; } @@ -1801,6 +1801,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { + err = -EINVAL; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2153,6 +2154,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); + enum skb_drop_reason reason; int err = -EINVAL; u32 tag = 0; @@ -2171,9 +2173,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, in_dev, - &tag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, + in_dev, &tag); + if (reason) goto martian_source; skip_validate_source: @@ -2215,6 +2217,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); @@ -2309,10 +2312,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; } + err = -EINVAL; if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, dscp, 0, dev, - in_dev, &itag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, + 0, dev, in_dev, &itag); + if (reason) goto martian_source; goto local_input; } @@ -2333,9 +2337,10 @@ brd_input: goto e_inval; if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, dscp, 0, dev, in_dev, - &itag); - if (err < 0) + err = -EINVAL; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, &itag); + if (reason) goto martian_source; } flags |= RTCF_BROADCAST; -- cgit v1.3 From c6c670784b862878deba7e16210ca4b2a2966ca0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:54 +0800 Subject: net: ip: make ip_route_input_mc() return drop reason Make ip_route_input_mc() return drop reason, and adjust the call of it in ip_route_input_rcu(). Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c38e95d9da9e..b20316789baa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1709,8 +1709,9 @@ int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, } /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, int our) +static enum skb_drop_reason +ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; @@ -1721,7 +1722,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (err) - return err; + return SKB_DROP_REASON_NOT_SPECIFIED; if (our) flags |= RTCF_LOCAL; @@ -1732,7 +1733,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) - return -ENOBUFS; + return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1748,7 +1749,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); - return 0; + return SKB_NOT_DROPPED_YET; } @@ -2446,12 +2447,12 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; - int err = -EINVAL; if (!in_dev) - return err; + return -EINVAL; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); @@ -2472,10 +2473,10 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - err = ip_route_input_mc(skb, daddr, saddr, dscp, dev, - our); + reason = ip_route_input_mc(skb, daddr, saddr, dscp, + dev, our); } - return err; + return reason ? -EINVAL : 0; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); -- cgit v1.3 From d46f827016d891dbc234cb05c406180f77fb3b2d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:55 +0800 Subject: net: ip: make ip_mc_validate_source() return drop reason Make ip_mc_validate_source() return drop reason, and adjust the call of it in ip_route_input_mc(). Another caller of it is ip_rcv_finish_core->udp_v4_early_demux, and the errno is not checked in detail, so we don't do more adjustment for it. The drop reason "SKB_DROP_REASON_IP_LOCALNET" is added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 3 +++ include/net/route.h | 7 ++++--- net/ipv4/route.c | 35 +++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 62a60be1db84..a2a1fb90e0e5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -78,6 +78,7 @@ FN(IP_INNOROUTES) \ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ + FN(IP_LOCALNET) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -383,6 +384,8 @@ enum skb_drop_reason { * 2) source ip is zero and not IGMP */ SKB_DROP_REASON_IP_INVALID_SOURCE, + /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ + SKB_DROP_REASON_IP_LOCALNET, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/include/net/route.h b/include/net/route.h index 0a690adfdff5..e2e1922c58fb 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -199,9 +199,10 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag); +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag); int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b20316789baa..ef0b5ffda843 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1678,34 +1678,37 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct in_device *in_dev, u32 *itag) +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) - return -EINVAL; + return SKB_DROP_REASON_NOT_SPECIFIED; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - skb->protocol != htons(ETH_P_IP)) - return -EINVAL; + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + return SKB_DROP_REASON_IP_INVALID_SOURCE; + + if (skb->protocol != htons(ETH_P_IP)) + return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - return -EINVAL; + return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) - return -EINVAL; + return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) - return -EINVAL; + return reason; } - return 0; + return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ @@ -1715,14 +1718,14 @@ ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; + enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; - int err; - err = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, - &itag); - if (err) - return SKB_DROP_REASON_NOT_SPECIFIED; + reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, + &itag); + if (reason) + return reason; if (our) flags |= RTCF_LOCAL; -- cgit v1.3 From 5b92112acd8e2ed84a4df653fc20575f4da6fa49 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:56 +0800 Subject: net: ip: make ip_route_input_slow() return drop reasons In this commit, we make ip_route_input_slow() return skb drop reasons, and following new skb drop reasons are added: SKB_DROP_REASON_IP_INVALID_DEST The only caller of ip_route_input_slow() is ip_route_input_rcu(), and we adjust it by making it return -EINVAL on error. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 6 +++++ net/ipv4/route.c | 56 +++++++++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index a2a1fb90e0e5..74624d369d48 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -79,6 +79,7 @@ FN(IP_LOCAL_SOURCE) \ FN(IP_INVALID_SOURCE) \ FN(IP_LOCALNET) \ + FN(IP_INVALID_DEST) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ @@ -386,6 +387,11 @@ enum skb_drop_reason { SKB_DROP_REASON_IP_INVALID_SOURCE, /** @SKB_DROP_REASON_IP_LOCALNET: source or dest ip is local net */ SKB_DROP_REASON_IP_LOCALNET, + /** + * @SKB_DROP_REASON_IP_INVALID_DEST: the dest ip is invalid: + * 1) dest ip is 0 + */ + SKB_DROP_REASON_IP_INVALID_DEST, /** * @SKB_DROP_REASON_PKT_TOO_BIG: packet size is too big (maybe exceed the * MTU) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ef0b5ffda843..b73f0355cc38 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2217,9 +2217,10 @@ static struct net_device *ip_rt_get_dev(struct net *net, * called with rcu_read_lock() */ -static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct fib_result *res) +static enum skb_drop_reason +ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2249,8 +2250,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } res->fi = NULL; res->table = NULL; @@ -2260,21 +2263,29 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(daddr)) + if (ipv4_is_zeronet(daddr)) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; + } } else if (ipv4_is_loopback(saddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } } /* @@ -2329,19 +2340,26 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res->type != RTN_UNICAST) + if (res->type != RTN_UNICAST) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } make_route: err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); -out: return err; + if (!err) + reason = SKB_NOT_DROPPED_YET; + +out: + return reason; brd_input: - if (skb->protocol != htons(ETH_P_IP)) - goto e_inval; + if (skb->protocol != htons(ETH_P_IP)) { + reason = SKB_DROP_REASON_INVALID_PROTO; + goto out; + } if (!ipv4_is_zeronet(saddr)) { - err = -EINVAL; reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) @@ -2362,7 +2380,7 @@ local_input: rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; } } @@ -2399,7 +2417,7 @@ local_input: rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; no_route: @@ -2420,12 +2438,8 @@ martian_destination: &daddr, &saddr, dev->name); #endif -e_inval: - err = -EINVAL; - goto out; - e_nobufs: - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: @@ -2482,7 +2496,7 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return reason ? -EINVAL : 0; } - return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res) ? -EINVAL : 0; } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, -- cgit v1.3 From 61b95c70f3449c1c0bd1415c8ef37e2959cf1c41 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:57 +0800 Subject: net: ip: make ip_route_input_rcu() return drop reasons In this commit, we make ip_route_input_rcu() return drop reasons, which come from ip_route_input_mc() and ip_route_input_slow(). The only caller of ip_route_input_rcu() is ip_route_input_noref(). We adjust it by making it return -EINVAL on error and ignore the reasons that ip_route_input_rcu() returns. In the following patch, we will make ip_route_input_noref() returns the drop reasons. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- net/ipv4/route.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b73f0355cc38..270bc8c96619 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2448,9 +2448,10 @@ martian_source: } /* called with rcu_read_lock held */ -static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - struct fib_result *res) +static enum skb_drop_reason +ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing @@ -2493,23 +2494,23 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } - return reason ? -EINVAL : 0; + return reason; } - return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res) ? -EINVAL : 0; + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { + enum skb_drop_reason reason; struct fib_result res; - int err; rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); + reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); - return err; + return reason ? -EINVAL : 0; } EXPORT_SYMBOL(ip_route_input_noref); @@ -3321,7 +3322,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, inet_dsfield_to_dscp(rtm->rtm_tos), - dev, &res); + dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) -- cgit v1.3 From 82d9983ebeb871cb5abd27c12a950c14c68772e1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:58 +0800 Subject: net: ip: make ip_route_input_noref() return drop reasons In this commit, we make ip_route_input_noref() return drop reasons, which come from ip_route_input_rcu(). We need adjust the callers of ip_route_input_noref() to make sure the return value of ip_route_input_noref() is used properly. The errno that ip_route_input_noref() returns comes from ip_route_input and bpf_lwt_input_reroute in the origin logic, and we make them return -EINVAL on error instead. In the following patch, we will make ip_route_input() returns drop reasons too. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 15 ++++++++------- net/core/lwt_bpf.c | 6 ++++-- net/ipv4/ip_fragment.c | 11 ++++++----- net/ipv4/ip_input.c | 7 ++++--- net/ipv4/route.c | 7 ++++--- 5 files changed, 26 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index e2e1922c58fb..b85ffa3e042b 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -203,8 +203,9 @@ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag); -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev); +enum skb_drop_reason +ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev); int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); @@ -212,18 +213,18 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, struct net_device *devin) { - int err; + enum skb_drop_reason reason; rcu_read_lock(); - err = ip_route_input_noref(skb, dst, src, dscp, devin); - if (!err) { + reason = ip_route_input_noref(skb, dst, src, dscp, devin); + if (!reason) { skb_dst_force(skb); if (!skb_dst(skb)) - err = -EINVAL; + reason = SKB_DROP_REASON_NOT_SPECIFIED; } rcu_read_unlock(); - return err; + return reason ? -EINVAL : 0; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index e0ca24a58810..8a78bff53b2c 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -88,6 +88,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, static int bpf_lwt_input_reroute(struct sk_buff *skb) { + enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { @@ -96,8 +97,9 @@ static int bpf_lwt_input_reroute(struct sk_buff *skb) dev_hold(dev); skb_dst_drop(skb); - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); + reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 48e2810f1f27..07036a2943c1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -132,12 +132,12 @@ static bool frag_expire_skip_icmp(u32 user) */ static void ip_expire(struct timer_list *t) { + enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; struct inet_frag_queue *frag = from_timer(frag, t, timer); const struct iphdr *iph; struct sk_buff *head = NULL; struct net *net; struct ipq *qp; - int err; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; @@ -175,14 +175,15 @@ static void ip_expire(struct timer_list *t) /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, ip4h_dscp(iph), - head->dev); - if (err) + reason = ip_route_input_noref(head, iph->daddr, iph->saddr, + ip4h_dscp(iph), head->dev); + if (reason) goto out; /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ + reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT; if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -195,7 +196,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + kfree_skb_reason(head, reason); ipq_put(qp); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c40a26972884..513eb0c6435a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -362,10 +362,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); - if (unlikely(err)) + drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (unlikely(drop_reason)) goto drop_error; + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 270bc8c96619..5a7edb66174a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2500,8 +2500,9 @@ ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev) +enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, + __be32 saddr, dscp_t dscp, + struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; @@ -2510,7 +2511,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } EXPORT_SYMBOL(ip_route_input_noref); -- cgit v1.3 From 50038bf38e6577a15d52b890d82c197cf3b163a0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:55:59 +0800 Subject: net: ip: make ip_route_input() return drop reasons In this commit, we make ip_route_input() return skb drop reasons that come from ip_route_input_noref(). Meanwhile, adjust all the call to it. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- net/bridge/br_netfilter_hooks.c | 11 ++++++----- net/ipv4/icmp.c | 2 +- net/ipv4/ip_options.c | 2 +- net/ipv6/seg6_local.c | 14 +++++++------- 5 files changed, 19 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index b85ffa3e042b..fb3433dc9c72 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -210,8 +210,9 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint); -static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - dscp_t dscp, struct net_device *devin) +static inline enum skb_drop_reason +ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, + struct net_device *devin) { enum skb_drop_reason reason; @@ -224,7 +225,7 @@ static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, } rcu_read_unlock(); - return reason ? -EINVAL : 0; + return reason; } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 7f2f40cef5fe..451e45b9a6a5 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -373,8 +373,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); struct net_device *dev = skb->dev, *br_indev; const struct iphdr *iph = ip_hdr(skb); + enum skb_drop_reason reason; struct rtable *rt; - int err; br_indev = nf_bridge_get_physindev(skb, net); if (!br_indev) { @@ -390,9 +390,9 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ } nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { - err = ip_route_input(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev); - if (err) { + reason = ip_route_input(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (reason) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* If err equals -EHOSTUNREACH the error is due to a @@ -402,7 +402,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ * martian destinations: loopback destinations and destination * 0.0.0.0. In both cases the packet will be dropped because the * destination is the loopback device and not the bridge. */ - if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev)) + if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev || + IN_DEV_FORWARD(in_dev)) goto free_skb; rt = ip_route_output(net, iph->daddr, 0, diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 33eec844a5a0..4f088fa1c2f2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -545,7 +545,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - dscp, rt2->dst.dev); + dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 81e86e5defee..e3321932bec0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -618,7 +618,7 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) orefdst = skb->_skb_refdst; skb_dst_set(skb, NULL); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), - dev); + dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c74705ead984..ac1dbd492c22 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -954,10 +954,10 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); + enum skb_drop_reason reason; struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; - int err; slwt = seg6_local_lwtunnel(orig_dst->lwtstate); @@ -967,9 +967,9 @@ static int input_action_end_dx4_finish(struct net *net, struct sock *sk, skb_dst_drop(skb); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) { - kfree_skb(skb); + reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); + if (reason) { + kfree_skb_reason(skb, reason); return -EINVAL; } @@ -1174,8 +1174,8 @@ drop: static int input_action_end_dt4(struct sk_buff *skb, struct seg6_local_lwt *slwt) { + enum skb_drop_reason reason; struct iphdr *iph; - int err; if (!decap_and_validate(skb, IPPROTO_IPIP)) goto drop; @@ -1193,8 +1193,8 @@ static int input_action_end_dt4(struct sk_buff *skb, iph = ip_hdr(skb); - err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); - if (unlikely(err)) + reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev); + if (unlikely(reason)) goto drop; return dst_input(skb); -- cgit v1.3 From d9340d1e02779dbf83d53b0deb4068c7768b8261 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:00 +0800 Subject: net: ip: make ip_mkroute_input/__mkroute_input return drop reasons In this commit, we make ip_mkroute_input() and __mkroute_input() return drop reasons. The drop reason "SKB_DROP_REASON_ARP_PVLAN_DISABLE" is introduced for the case: the packet which is not IP is forwarded to the in_dev, and the proxy_arp_pvlan is not enabled. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/dropreason-core.h | 7 +++++++ net/ipv4/route.c | 34 ++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 74624d369d48..6c5a1ea209a2 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -104,6 +104,7 @@ FN(IP_TUNNEL_ECN) \ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ + FN(ARP_PVLAN_DISABLE) \ FNe(MAX) /** @@ -477,6 +478,12 @@ enum skb_drop_reason { * the MAC address of the local netdev. */ SKB_DROP_REASON_LOCAL_MAC, + /** + * @SKB_DROP_REASON_ARP_PVLAN_DISABLE: packet which is not IP is + * forwarded to the in_dev, and the proxy_arp_pvlan is not + * enabled. + */ + SKB_DROP_REASON_ARP_PVLAN_DISABLE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5a7edb66174a..2697a6c88416 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1782,10 +1782,12 @@ static void ip_handle_martian_source(struct net_device *dev, } /* called in rcu_read_lock() section */ -static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, - struct in_device *in_dev, __be32 daddr, - __be32 saddr, dscp_t dscp) +static enum skb_drop_reason +__mkroute_input(struct sk_buff *skb, const struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; @@ -1799,13 +1801,13 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); - return -EINVAL; + return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { - err = -EINVAL; + reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -1833,7 +1835,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { - err = -EINVAL; + reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } @@ -1856,7 +1858,7 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto cleanup; } @@ -1870,9 +1872,9 @@ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: - err = 0; - cleanup: - return err; + reason = SKB_NOT_DROPPED_YET; +cleanup: + return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -2130,9 +2132,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - struct in_device *in_dev, __be32 daddr, - __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) +static enum skb_drop_reason +ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { @@ -2346,9 +2349,8 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, } make_route: - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); - if (!err) - reason = SKB_NOT_DROPPED_YET; + reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, + flkeys); out: return reason; -- cgit v1.3 From 479aed04e84a5d66caa3b25bfc651292c153ef70 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 7 Nov 2024 20:56:01 +0800 Subject: net: ip: make ip_route_use_hint() return drop reasons In this commit, we make ip_route_use_hint() return drop reasons. The drop reasons that we return are similar to what we do in ip_route_input_slow(), and no drop reasons are added in this commit. Signed-off-by: Menglong Dong Signed-off-by: Paolo Abeni --- include/net/route.h | 7 ++++--- net/ipv4/ip_input.c | 9 ++++----- net/ipv4/route.c | 28 +++++++++++++++++----------- 3 files changed, 25 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index fb3433dc9c72..84cb1e04f5cd 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -206,9 +206,10 @@ ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev); -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint); +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint); static inline enum skb_drop_reason ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, dscp_t dscp, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 513eb0c6435a..f0a4dda246ab 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -322,15 +322,14 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, int err, drop_reason; struct rtable *rt; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; - if (ip_can_use_hint(skb, iph, hint)) { - err = ip_route_use_hint(skb, iph->daddr, iph->saddr, - ip4h_dscp(iph), dev, hint); - if (unlikely(err)) + drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev, hint); + if (unlikely(drop_reason)) goto drop_error; } + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2697a6c88416..e5603e84b20d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,28 +2154,34 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - dscp_t dscp, struct net_device *dev, - const struct sk_buff *hint) +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); - enum skb_drop_reason reason; - int err = -EINVAL; u32 tag = 0; if (!in_dev) - return -EINVAL; + return reason; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; @@ -2187,11 +2193,11 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, skip_validate_source: skb_dst_copy(skb, hint); - return 0; + return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - return err; + return reason; } /* get device for dst_alloc with local routes */ -- cgit v1.3 From 12079a59ce52e72a342c49cfacf0281213fd6f32 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 7 Nov 2024 08:11:44 -0800 Subject: net: Implement fault injection forcing skb reallocation Introduce a fault injection mechanism to force skb reallocation. The primary goal is to catch bugs related to pointer invalidation after potential skb reallocation. The fault injection mechanism aims to identify scenarios where callers retain pointers to various headers in the skb but fail to reload these pointers after calling a function that may reallocate the data. This type of bug can lead to memory corruption or crashes if the old, now-invalid pointers are used. By forcing reallocation through fault injection, we can stress-test code paths and ensure proper pointer management after potential skb reallocations. Add a hook for fault injection in the following functions: * pskb_trim_rcsum() * pskb_may_pull_reason() * pskb_trim() As the other fault injection mechanism, protect it under a debug Kconfig called CONFIG_FAIL_SKB_REALLOC. This patch was *heavily* inspired by Jakub's proposal from: https://lore.kernel.org/all/20240719174140.47a868e6@kernel.org/ CC: Akinobu Mita Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Akinobu Mita Acked-by: Paolo Abeni Acked-by: Guillaume Nault Link: https://patch.msgid.link/20241107-fault_v6-v6-1-1b82cb6ecacd@debian.org Signed-off-by: Paolo Abeni --- Documentation/admin-guide/kernel-parameters.txt | 1 + Documentation/fault-injection/fault-injection.rst | 40 ++++++++ include/linux/skbuff.h | 9 ++ lib/Kconfig.debug | 10 ++ net/core/Makefile | 1 + net/core/skb_fault_injection.c | 106 ++++++++++++++++++++++ 6 files changed, 167 insertions(+) create mode 100644 net/core/skb_fault_injection.c (limited to 'net') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..2fb830453dcc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1546,6 +1546,7 @@ failslab= fail_usercopy= fail_page_alloc= + fail_skb_realloc= fail_make_request=[KNL] General fault injection mechanism. Format: ,,, diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index 8b8aeea71c68..1c14ba08fbfc 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -45,6 +45,32 @@ Available fault injection capabilities ALLOW_ERROR_INJECTION() macro, by setting debugfs entries under /sys/kernel/debug/fail_function. No boot option supported. +- fail_skb_realloc + + inject skb (socket buffer) reallocation events into the network path. The + primary goal is to identify and prevent issues related to pointer + mismanagement in the network subsystem. By forcing skb reallocation at + strategic points, this feature creates scenarios where existing pointers to + skb headers become invalid. + + When the fault is injected and the reallocation is triggered, cached pointers + to skb headers and data no longer reference valid memory locations. This + deliberate invalidation helps expose code paths where proper pointer updating + is neglected after a reallocation event. + + By creating these controlled fault scenarios, the system can catch instances + where stale pointers are used, potentially leading to memory corruption or + system instability. + + To select the interface to act on, write the network name to + /sys/kernel/debug/fail_skb_realloc/devname. + If this field is left empty (which is the default value), skb reallocation + will be forced on all network interfaces. + + The effectiveness of this fault detection is enhanced when KASAN is + enabled, as it helps identify invalid memory references and use-after-free + (UAF) issues. + - NVMe fault injection inject NVMe status code and retry flag on devices permitted by setting @@ -216,6 +242,19 @@ configuration of fault-injection capabilities. use a negative errno, you better use 'printf' instead of 'echo', e.g.: $ printf %#x -12 > retval +- /sys/kernel/debug/fail_skb_realloc/devname: + + Specifies the network interface on which to force SKB reallocation. If + left empty, SKB reallocation will be applied to all network interfaces. + + Example usage:: + + # Force skb reallocation on eth0 + echo "eth0" > /sys/kernel/debug/fail_skb_realloc/devname + + # Clear the selection and force skb reallocation on all interfaces + echo "" > /sys/kernel/debug/fail_skb_realloc/devname + Boot option ^^^^^^^^^^^ @@ -227,6 +266,7 @@ use the boot option:: fail_usercopy= fail_make_request= fail_futex= + fail_skb_realloc= mmc_core.fail_request=,,, proc entries diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 60535c706851..58009fa66102 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,6 +2682,12 @@ static inline void skb_assert_len(struct sk_buff *skb) #endif /* CONFIG_DEBUG_NET */ } +#if defined(CONFIG_FAIL_SKB_REALLOC) +void skb_might_realloc(struct sk_buff *skb); +#else +static inline void skb_might_realloc(struct sk_buff *skb) {} +#endif + /* * Add data to an sk_buff */ @@ -2782,6 +2788,7 @@ static inline enum skb_drop_reason pskb_may_pull_reason(struct sk_buff *skb, unsigned int len) { DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_might_realloc(skb); if (likely(len <= skb_headlen(skb))) return SKB_NOT_DROPPED_YET; @@ -3240,6 +3247,7 @@ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) static inline int pskb_trim(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); return (len < skb->len) ? __pskb_trim(skb, len) : 0; } @@ -3994,6 +4002,7 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len); static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) { + skb_might_realloc(skb); if (likely(len >= skb->len)) return 0; return pskb_trim_rcsum_slow(skb, len); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..67b669d2e70e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2115,6 +2115,16 @@ config FAIL_SUNRPC Provide fault-injection capability for SunRPC and its consumers. +config FAIL_SKB_REALLOC + bool "Fault-injection capability forcing skb to reallocate" + depends on FAULT_INJECTION_DEBUG_FS + help + Provide fault-injection capability that forces the skb to be + reallocated, catching possible invalid pointers to the skb. + + For more information, check + Documentation/dev-tools/fault-injection/fault-injection.rst + config FAULT_INJECTION_CONFIGFS bool "Configfs interface for fault-injection capabilities" depends on FAULT_INJECTION diff --git a/net/core/Makefile b/net/core/Makefile index 5a72a87ee0f1..d9326600e289 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -46,3 +46,4 @@ obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o +obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c new file mode 100644 index 000000000000..4235db6bdfad --- /dev/null +++ b/net/core/skb_fault_injection.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +static struct { + struct fault_attr attr; + char devname[IFNAMSIZ]; + bool filtered; +} skb_realloc = { + .attr = FAULT_ATTR_INITIALIZER, + .filtered = false, +}; + +static bool should_fail_net_realloc_skb(struct sk_buff *skb) +{ + struct net_device *net = skb->dev; + + if (skb_realloc.filtered && + strncmp(net->name, skb_realloc.devname, IFNAMSIZ)) + /* device name filter set, but names do not match */ + return false; + + if (!should_fail(&skb_realloc.attr, 1)) + return false; + + return true; +} +ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE); + +void skb_might_realloc(struct sk_buff *skb) +{ + if (!should_fail_net_realloc_skb(skb)) + return; + + pskb_expand_head(skb, 0, 0, GFP_ATOMIC); +} +EXPORT_SYMBOL(skb_might_realloc); + +static int __init fail_skb_realloc_setup(char *str) +{ + return setup_fault_attr(&skb_realloc.attr, str); +} +__setup("fail_skb_realloc=", fail_skb_realloc_setup); + +static void reset_settings(void) +{ + skb_realloc.filtered = false; + memset(&skb_realloc.devname, 0, IFNAMSIZ); +} + +static ssize_t devname_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + ssize_t ret; + + reset_settings(); + ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ, + ppos, buffer, count); + if (ret < 0) + return ret; + + skb_realloc.devname[IFNAMSIZ - 1] = '\0'; + /* Remove a possible \n at the end of devname */ + strim(skb_realloc.devname); + + if (strnlen(skb_realloc.devname, IFNAMSIZ)) + skb_realloc.filtered = true; + + return count; +} + +static ssize_t devname_read(struct file *file, + char __user *buffer, + size_t size, loff_t *ppos) +{ + if (!skb_realloc.filtered) + return 0; + + return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname, + strlen(skb_realloc.devname)); +} + +static const struct file_operations devname_ops = { + .write = devname_write, + .read = devname_read, +}; + +static int __init fail_skb_realloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_skb_realloc", NULL, + &skb_realloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_file("devname", mode, dir, NULL, &devname_ops); + + return 0; +} + +late_initcall(fail_skb_realloc_debugfs); -- cgit v1.3 From d7b0ff5a866724c3ad21f2628c22a63336deec3f Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:12 +0100 Subject: virtio/vsock: Fix accept_queue memory leak As the final stages of socket destruction may be delayed, it is possible that virtio_transport_recv_listen() will be called after the accept_queue has been flushed, but before the SOCK_DONE flag has been set. As a result, sockets enqueued after the flush would remain unremoved, leading to a memory leak. vsock_release __vsock_release lock virtio_transport_release virtio_transport_close schedule_delayed_work(close_work) sk_shutdown = SHUTDOWN_MASK (!) flush accept_queue release virtio_transport_recv_pkt vsock_find_bound_socket lock if flag(SOCK_DONE) return virtio_transport_recv_listen child = vsock_create_connected (!) vsock_enqueue_accept(child) release close_work lock virtio_transport_do_close set_flag(SOCK_DONE) virtio_transport_remove_sock vsock_remove_sock vsock_remove_bound release Introduce a sk_shutdown check to disallow vsock_enqueue_accept() during socket destruction. unreferenced object 0xffff888109e3f800 (size 2040): comm "kworker/5:2", pid 371, jiffies 4294940105 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 28 00 0b 40 00 00 00 00 00 00 00 00 00 00 00 00 (..@............ backtrace (crc 9e5f4e84): [] kmem_cache_alloc_noprof+0x2c1/0x360 [] sk_prot_alloc+0x30/0x120 [] sk_alloc+0x2c/0x4b0 [] __vsock_create.constprop.0+0x2a/0x310 [] virtio_transport_recv_pkt+0x4dc/0x9a0 [] vsock_loopback_work+0xfd/0x140 [] process_one_work+0x20c/0x570 [] worker_thread+0x1bf/0x3a0 [] kthread+0xdd/0x110 [] ret_from_fork+0x2d/0x50 [] ret_from_fork_asm+0x1a/0x30 Fixes: 3fe356d58efa ("vsock/virtio: discard packets only when socket is really closed") Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index ccbd2bc0d210..cd075f608d4f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1512,6 +1512,14 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, return -ENOMEM; } + /* __vsock_release() might have already flushed accept_queue. + * Subsequent enqueues would lead to a memory leak. + */ + if (sk->sk_shutdown == SHUTDOWN_MASK) { + virtio_transport_reset_no_sock(t, skb); + return -ESHUTDOWN; + } + child = vsock_create_connected(sk); if (!child) { virtio_transport_reset_no_sock(t, skb); -- cgit v1.3 From fbf7085b3ad1c7cc0677834c90f985f1b4f77a33 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:13 +0100 Subject: vsock: Fix sk_error_queue memory leak Kernel queues MSG_ZEROCOPY completion notifications on the error queue. Where they remain, until explicitly recv()ed. To prevent memory leaks, clean up the queue when the socket is destroyed. unreferenced object 0xffff8881028beb00 (size 224): comm "vsock_test", pid 1218, jiffies 4294694897 hex dump (first 32 bytes): 90 b0 21 17 81 88 ff ff 90 b0 21 17 81 88 ff ff ..!.......!..... 00 00 00 00 00 00 00 00 00 b0 21 17 81 88 ff ff ..........!..... backtrace (crc 6c7031ca): [] kmem_cache_alloc_node_noprof+0x2f7/0x370 [] __alloc_skb+0x132/0x180 [] sock_omalloc+0x4b/0x80 [] msg_zerocopy_realloc+0x9e/0x240 [] virtio_transport_send_pkt_info+0x412/0x4c0 [] virtio_transport_stream_enqueue+0x43/0x50 [] vsock_connectible_sendmsg+0x373/0x450 [] ____sys_sendmsg+0x365/0x3a0 [] ___sys_sendmsg+0x84/0xd0 [] __sys_sendmsg+0x47/0x80 [] do_syscall_64+0x93/0x180 [] entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Acked-by: Arseniy Krasnov Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 35681adedd9a..dfd29160fe11 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -836,6 +836,9 @@ static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); + /* Flush MSG_ZEROCOPY leftovers. */ + __skb_queue_purge(&sk->sk_error_queue); + vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and -- cgit v1.3 From 60cf6206a1f513512f5d73fa4d3dbbcad2e7dcd6 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:14 +0100 Subject: virtio/vsock: Improve MSG_ZEROCOPY error handling Add a missing kfree_skb() to prevent memory leaks. Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Acked-by: Arseniy Krasnov Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index cd075f608d4f..e2e6a30b759b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -400,6 +400,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (virtio_transport_init_zcopy_skb(vsk, skb, info->msg, can_zcopy)) { + kfree_skb(skb); ret = -ENOMEM; break; } -- cgit v1.3 From a58f00ed24b849d449f7134fd5d86f07090fe2f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Nov 2024 17:02:54 -0800 Subject: net: sched: cls_api: improve the error message for ID allocation failure We run into an exhaustion problem with the kernel-allocated filter IDs. Our allocation problem can be fixed on the user space side, but the error message in this case was quite misleading: "Filter with specified priority/protocol not found" (EINVAL) Specifically when we can't allocate a _new_ ID because filter with lowest ID already _exists_, saying "filter not found", is confusing. Kernel allocates IDs in range of 0xc0000 -> 0x8000, giving out ID one lower than lowest existing in that range. The error message makes sense when tcf_chain_tp_find() gets called for GET and DEL but for NEW we need to provide more specific error messages for all three cases: - user wants the ID to be auto-allocated but filter with ID 0x8000 already exists - filter already exists and can be replaced, but user asked for a protocol change - filter doesn't exist Caller of tcf_chain_tp_insert_unique() doesn't set extack today, so don't bother plumbing it in. Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20241108010254.2995438-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/sched/cls_api.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 04942f8c62e0..7578e27260c9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1933,7 +1933,8 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, - bool prio_allocate); + bool prio_allocate, + struct netlink_ext_ack *extack); /* Try to insert new proto. * If proto with specified priority already exists, free new proto @@ -1957,8 +1958,7 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, return ERR_PTR(-EAGAIN); } - tp = tcf_chain_tp_find(chain, &chain_info, - protocol, prio, false); + tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false, NULL); if (!tp) err = tcf_chain_tp_insert(chain, &chain_info, tp_new); mutex_unlock(&chain->filter_chain_lock); @@ -2018,7 +2018,8 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, struct tcf_chain_info *chain_info, u32 protocol, u32 prio, - bool prio_allocate) + bool prio_allocate, + struct netlink_ext_ack *extack) { struct tcf_proto **pprev; struct tcf_proto *tp; @@ -2029,9 +2030,14 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, pprev = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (prio_allocate || - (tp->protocol != protocol && protocol)) + if (prio_allocate) { + NL_SET_ERR_MSG(extack, "Lowest ID from auto-alloc range already in use"); + return ERR_PTR(-ENOSPC); + } + if (tp->protocol != protocol && protocol) { + NL_SET_ERR_MSG(extack, "Protocol mismatch for filter with specified priority"); return ERR_PTR(-EINVAL); + } } else { tp = NULL; } @@ -2312,9 +2318,8 @@ replay: mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, prio_allocate); + prio, prio_allocate, extack); if (IS_ERR(tp)) { - NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); err = PTR_ERR(tp); goto errout_locked; } @@ -2539,10 +2544,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, false); - if (!tp || IS_ERR(tp)) { + prio, false, extack); + if (!tp) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); - err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout_locked; + } else if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout_locked; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); @@ -2679,11 +2687,14 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_tp_find(chain, &chain_info, protocol, - prio, false); + prio, false, extack); mutex_unlock(&chain->filter_chain_lock); - if (!tp || IS_ERR(tp)) { + if (!tp) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Filter with specified priority/protocol not found"); - err = tp ? PTR_ERR(tp) : -ENOENT; + goto errout; + } else if (IS_ERR(tp)) { + err = PTR_ERR(tp); goto errout; } else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind)) { NL_SET_ERR_MSG(extack, "Specified filter kind does not match existing one"); -- cgit v1.3 From 8251e7621b25ccdb689f1dd9553b8789e3745ea1 Mon Sep 17 00:00:00 2001 From: Mingwei Zheng Date: Fri, 8 Nov 2024 14:53:41 -0500 Subject: net: rfkill: gpio: Add check for clk_enable() Add check for the return value of clk_enable() to catch the potential error. Fixes: 7176ba23f8b5 ("net: rfkill: add generic gpio rfkill driver") Signed-off-by: Mingwei Zheng Signed-off-by: Jiasheng Jiang Link: https://patch.msgid.link/20241108195341.1853080-1-zmw12306@gmail.com Signed-off-by: Johannes Berg --- net/rfkill/rfkill-gpio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index c268c2b011f4..a8e21060112f 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -32,8 +32,12 @@ static int rfkill_gpio_set_power(void *data, bool blocked) { struct rfkill_gpio_data *rfkill = data; - if (!blocked && !IS_ERR(rfkill->clk) && !rfkill->clk_enabled) - clk_enable(rfkill->clk); + if (!blocked && !IS_ERR(rfkill->clk) && !rfkill->clk_enabled) { + int ret = clk_enable(rfkill->clk); + + if (ret) + return ret; + } gpiod_set_value_cansleep(rfkill->shutdown_gpio, !blocked); gpiod_set_value_cansleep(rfkill->reset_gpio, !blocked); -- cgit v1.3 From 406c5548c661df0bff6bb6ee79bf9d49faf23e31 Mon Sep 17 00:00:00 2001 From: MeiChia Chiu Date: Tue, 12 Nov 2024 16:38:46 +0800 Subject: wifi: mac80211: Support EHT 1024 aggregation size in TX Support EHT 1024 aggregation size in TX The 1024 agg size for RX is supported but not for TX. This patch adds this support and refactors common parsing logics for addbaext in both process_addba_resp and process_addba_req into a function. Reviewed-by: Shayne Chen Reviewed-by: Money Wang Co-developed-by: Peter Chiu Signed-off-by: Peter Chiu Signed-off-by: MeiChia Chiu Link: https://patch.msgid.link/20241112083846.32063-1-MeiChia.Chiu@mediatek.com [pass elems/len instead of mgmt/len/is_req] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 + net/mac80211/agg-rx.c | 94 ++++++++++++++++++++++++++++------------------ net/mac80211/agg-tx.c | 31 ++++++++++----- net/mac80211/ht.c | 2 +- net/mac80211/ieee80211_i.h | 9 ++++- 5 files changed, 90 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 456bca45ff05..05dedc45505c 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1445,6 +1445,8 @@ struct ieee80211_mgmt { __le16 status; __le16 capab; __le16 timeout; + /* followed by BA Extension */ + u8 variable[]; } __packed addba_resp; struct{ u8 action_code; diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index fe7eab4b681b..f3fbe5a4395e 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -170,28 +170,63 @@ static void sta_rx_agg_reorder_timer_expired(struct timer_list *t) rcu_read_unlock(); } -static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, - const struct ieee80211_addba_ext_ie *req, - u16 buf_size) +void ieee80211_add_addbaext(struct sk_buff *skb, + const u8 req_addba_ext_data, + u16 buf_size) { - struct ieee80211_addba_ext_ie *resp; + struct ieee80211_addba_ext_ie *addba_ext; u8 *pos; pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie)); *pos++ = WLAN_EID_ADDBA_EXT; *pos++ = sizeof(struct ieee80211_addba_ext_ie); - resp = (struct ieee80211_addba_ext_ie *)pos; - resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG; + addba_ext = (struct ieee80211_addba_ext_ie *)pos; - resp->data |= u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT, - IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); + addba_ext->data = IEEE80211_ADDBA_EXT_NO_FRAG; + if (req_addba_ext_data) + addba_ext->data &= req_addba_ext_data; + + addba_ext->data |= + u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT, + IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); +} + +u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, + const void *elem_data, ssize_t elem_len, + u16 *buf_size) +{ + struct ieee802_11_elems *elems; + u8 buf_size_1k, data = 0; + + if (!sta->sta.deflink.he_cap.has_he) + return 0; + + if (elem_len <= 0) + return 0; + + elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL); + + if (elems && !elems->parse_error && elems->addba_ext_ie) { + data = elems->addba_ext_ie->data; + + if (!sta->sta.deflink.eht_cap.has_eht || !buf_size) + goto free; + + buf_size_1k = u8_get_bits(elems->addba_ext_ie->data, + IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); + *buf_size |= (u16)buf_size_1k << + IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT; + } +free: + kfree(elems); + + return data; } static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, u16 buf_size, u16 timeout, - const struct ieee80211_addba_ext_ie *addbaext) + const u8 req_addba_ext_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; @@ -223,8 +258,8 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); - if (sta->sta.deflink.he_cap.has_he && addbaext) - ieee80211_add_addbaext(sdata, skb, addbaext, buf_size); + if (sta->sta.deflink.he_cap.has_he) + ieee80211_add_addbaext(skb, req_addba_ext_data, buf_size); ieee80211_tx_skb(sdata, skb); } @@ -233,7 +268,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, - const struct ieee80211_addba_ext_ie *addbaext) + const u8 addba_ext_data) { struct ieee80211_local *local = sta->sdata->local; struct tid_ampdu_rx *tid_agg_rx; @@ -419,7 +454,7 @@ end: if (tx) ieee80211_send_addba_resp(sta, sta->sta.addr, tid, dialog_token, status, 1, buf_size, - timeout, addbaext); + timeout, addba_ext_data); } void ieee80211_process_addba_request(struct ieee80211_local *local, @@ -428,9 +463,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len) { u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; - struct ieee802_11_elems *elems = NULL; - u8 dialog_token; - int ies_len; + u8 dialog_token, addba_ext_data; /* extract session parameters from addba request frame */ dialog_token = mgmt->u.action.u.addba_req.dialog_token; @@ -443,28 +476,17 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; - ies_len = len - offsetof(struct ieee80211_mgmt, - u.action.u.addba_req.variable); - if (ies_len) { - elems = ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable, - ies_len, true, NULL); - if (!elems || elems->parse_error) - goto free; - } - - if (sta->sta.deflink.eht_cap.has_eht && elems && elems->addba_ext_ie) { - u8 buf_size_1k = u8_get_bits(elems->addba_ext_ie->data, - IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); - - buf_size |= buf_size_1k << IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT; - } + addba_ext_data = + ieee80211_retrieve_addba_ext_data(sta, + mgmt->u.action.u.addba_req.variable, + len - + offsetof(typeof(*mgmt), + u.action.u.addba_req.variable), + &buf_size); __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, true, false, - elems ? elems->addba_ext_ie : NULL); -free: - kfree(elems); + buf_size, true, false, addba_ext_data); } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 04cb45cfb310..61f2cac37728 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -58,23 +58,24 @@ * complete. */ -static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, - const u8 *da, u16 tid, +static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, u8 dialog_token, u16 start_seq_num, u16 agg_size, u16 timeout) { + struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u16 capab; - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); - + skb = dev_alloc_skb(sizeof(*mgmt) + + 2 + sizeof(struct ieee80211_addba_ext_ie) + + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = ieee80211_mgmt_ba(skb, da, sdata); + mgmt = ieee80211_mgmt_ba(skb, sta->sta.addr, sdata); skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); @@ -93,6 +94,9 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.addba_req.start_seq_num = cpu_to_le16(start_seq_num << 4); + if (sta->sta.deflink.he_cap.has_he) + ieee80211_add_addbaext(skb, 0, agg_size); + ieee80211_tx_skb_tid(sdata, skb, tid, -1); } @@ -460,8 +464,11 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); - if (sta->sta.deflink.he_cap.has_he) { + if (sta->sta.deflink.eht_cap.has_eht) { buf_size = local->hw.max_tx_aggregation_subframes; + } else if (sta->sta.deflink.he_cap.has_he) { + buf_size = min_t(u16, local->hw.max_tx_aggregation_subframes, + IEEE80211_MAX_AMPDU_BUF_HE); } else { /* * We really should use what the driver told us it will @@ -473,9 +480,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, } /* send AddBA request */ - ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, tid_tx->ssn, - buf_size, tid_tx->timeout); + ieee80211_send_addba_request(sta, tid, tid_tx->dialog_token, + tid_tx->ssn, buf_size, tid_tx->timeout); WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); } @@ -970,6 +976,13 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK); buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); + + ieee80211_retrieve_addba_ext_data(sta, + mgmt->u.action.u.addba_resp.variable, + len - offsetof(typeof(*mgmt), + u.action.u.addba_resp.variable), + &buf_size); + buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); txq = sta->sta.txq[tid]; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 1c2b7dd8976a..32390d8a9d75 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -379,7 +379,7 @@ void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, - false, true, NULL); + false, true, 0); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 7dcb46120abc..752297bcde76 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2111,14 +2111,19 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, const u8 *bssid, int link_id); bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old, enum ieee80211_smps_mode smps_mode_new); - +void ieee80211_add_addbaext(struct sk_buff *skb, + const u8 req_addba_ext_data, + u16 buf_size); +u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, + const void *elem_data, ssize_t elem_len, + u16 *buf_size); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason, bool stop); void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, - const struct ieee80211_addba_ext_ie *addbaext); + const u8 addba_ext_data); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, -- cgit v1.3 From f2aadc721274a4b27d3dfe8244e73fbdc8c17715 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 8 Nov 2024 09:22:27 +0100 Subject: wifi: mac80211: pass MBSSID config by reference It's inefficient and confusing to pass the MBSSID config by value, requiring the whole struct to be copied. Pass it by reference instead. Link: https://patch.msgid.link/20241108092227.48fbd8a00112.I64abc1296a7557aadf798d88db931024486ab3b6@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 6c0b228523cb..132e194c8d72 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -143,7 +143,7 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, } static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, - struct cfg80211_mbssid_config params, + struct cfg80211_mbssid_config *params, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; @@ -154,10 +154,10 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, link_conf->ema_ap = false; link_conf->bssid_indicator = 0; - if (sdata->vif.type != NL80211_IFTYPE_AP || !params.tx_wdev) + if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev) return -EINVAL; - tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params.tx_wdev); + tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev); if (!tx_sdata) return -EINVAL; @@ -166,9 +166,9 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, } else { sdata->vif.mbssid_tx_vif = &tx_sdata->vif; link_conf->nontransmitted = true; - link_conf->bssid_index = params.index; + link_conf->bssid_index = params->index; } - if (params.ema) + if (params->ema) link_conf->ema_ap = true; return 0; @@ -1414,7 +1414,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, - params->mbssid_config, + ¶ms->mbssid_config, link_conf); if (err) return err; -- cgit v1.3 From 7967dc8f797f454d4f4acec15c7df0cdf4801617 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 8 Nov 2024 11:19:54 -0500 Subject: Bluetooth: hci_core: Fix calling mgmt_device_connected Since 61a939c68ee0 ("Bluetooth: Queue incoming ACL data until BT_CONNECTED state is reached") there is no long the need to call mgmt_device_connected as ACL data will be queued until BT_CONNECTED state. Link: https://bugzilla.kernel.org/show_bug.cgi?id=219458 Link: https://github.com/bluez/bluez/issues/1014 Fixes: 333b4fd11e89 ("Bluetooth: L2CAP: Fix uaf in l2cap_connect") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 96d097b21d13..0ac354db8177 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3788,8 +3788,6 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); - if (conn && hci_dev_test_flag(hdev, HCI_MGMT)) - mgmt_device_connected(hdev, conn, NULL, 0); hci_dev_unlock(hdev); if (conn) { -- cgit v1.3 From b169e76ebad22cbd055101ee5aa1a7bed0e66606 Mon Sep 17 00:00:00 2001 From: Dmitry Kandybka Date: Thu, 7 Nov 2024 13:36:57 +0300 Subject: mptcp: fix possible integer overflow in mptcp_reset_tout_timer In 'mptcp_reset_tout_timer', promote 'probe_timestamp' to unsigned long to avoid possible integer overflow. Compile tested only. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Dmitry Kandybka Link: https://patch.msgid.link/20241107103657.1560536-1-d.kandybka@gmail.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b0e9a745ea62..a6f2a25edb11 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2722,8 +2722,8 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; - close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + - mptcp_close_timeout(sk); + close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - + tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active -- cgit v1.3 From 73af53d82076bbe184d9ece9e14b0dc8599e6055 Mon Sep 17 00:00:00 2001 From: Alexandre Ferrieux Date: Sun, 10 Nov 2024 18:28:36 +0100 Subject: net: sched: cls_u32: Fix u32's systematic failure to free IDR entries for hnodes. To generate hnode handles (in gen_new_htid()), u32 uses IDR and encodes the returned small integer into a structured 32-bit word. Unfortunately, at disposal time, the needed decoding is not done. As a result, idr_remove() fails, and the IDR fills up. Since its size is 2048, the following script ends up with "Filter already exists": tc filter add dev myve $FILTER1 tc filter add dev myve $FILTER2 for i in {1..2048} do echo $i tc filter del dev myve $FILTER2 tc filter add dev myve $FILTER2 done This patch adds the missing decoding logic for handles that deserve it. Fixes: e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles") Reviewed-by: Eric Dumazet Acked-by: Jamal Hadi Salim Signed-off-by: Alexandre Ferrieux Tested-by: Victor Nogueira Link: https://patch.msgid.link/20241110172836.331319-1-alexandre.ferrieux@orange.com Signed-off-by: Jakub Kicinski --- net/sched/cls_u32.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 9412d88a99bc..d3a03c57545b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -92,6 +92,16 @@ struct tc_u_common { long knodes; }; +static u32 handle2id(u32 h) +{ + return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h); +} + +static u32 id2handle(u32 id) +{ + return (id | 0x800U) << 20; +} + static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -310,7 +320,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL); if (id < 0) return 0; - return (id | 0x800U) << 20; + return id2handle(id); } static struct hlist_head *tc_u_common_hash; @@ -360,7 +370,7 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; refcount_set(&root_ht->refcnt, 1); - root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; + root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0); root_ht->prio = tp->prio; root_ht->is_root = true; idr_init(&root_ht->handle_idr); @@ -612,7 +622,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, if (phn == ht) { u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); - idr_remove(&tp_c->handle_idr, ht->handle); + idr_remove(&tp_c->handle_idr, handle2id(ht->handle)); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -989,7 +999,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { - idr_remove(&tp_c->handle_idr, handle); + idr_remove(&tp_c->handle_idr, handle2id(handle)); kfree(ht); return err; } -- cgit v1.3 From ef04d290c01301b7467df48425c36891d86ff417 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 8 Nov 2024 18:33:03 -0800 Subject: net: page_pool: do not count normal frag allocation in stats Commit 0f6deac3a079 ("net: page_pool: add page allocation stats for two fast page allocate path") added increments for "fast path" allocation to page frag alloc. It mentions performance degradation analysis but the details are unclear. Could be that the author was simply surprised by the alloc stats not matching packet count. In my experience the key metric for page pool is the recycling rate. Page return stats, however, count returned _pages_ not frags. This makes it impossible to calculate recycling rate for drivers using the frag API. Here is example output of the page-pool YNL sample for a driver allocating 1200B frags (4k pages) with nearly perfect recycling: $ ./page-pool eth0[2] page pools: 32 (zombies: 0) refs: 291648 bytes: 1194590208 (refs: 0 bytes: 0) recycling: 33.3% (alloc: 4557:2256365862 recycle: 200476245:551541893) The recycling rate is reported as 33.3% because we give out 4096 // 1200 = 3 frags for every recycled page. Effectively revert the aforementioned commit. This also aligns with the stats we would see for drivers which do the fragmentation themselves, although that's not a strong reason in itself. On the (very unlikely) path where we can reuse the current page let's bump the "cached" stat. The fact that we don't put the page in the cache is just an optimization. Acked-by: Jesper Dangaard Brouer Reviewed-by: Xuan Zhuo Acked-by: Ilias Apalodimas Link: https://patch.msgid.link/20241109023303.3366500-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d2135..f89cf93f6eb4 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -950,6 +950,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, if (netmem && *offset + size > max_size) { netmem = page_pool_drain_frag(pool, netmem); if (netmem) { + recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } @@ -974,7 +975,6 @@ frag_reset: pool->frag_users++; pool->frag_offset = *offset + size; - alloc_stat_inc(pool, fast); return netmem; } EXPORT_SYMBOL(page_pool_alloc_frag_netmem); -- cgit v1.3 From e0266319413d5d687ba7b6df7ca99e4b9724a4f2 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 12 Nov 2024 20:18:33 +0100 Subject: mptcp: update local address flags when setting it Just like in-kernel pm, when userspace pm does set_flags, it needs to send out MP_PRIO signal, and also modify the flags of the corresponding address entry in the local address list. This patch implements the missing logic. Traverse all address entries on userspace_pm_local_addr_list to find the local address entry, if bkup is true, set the flags of this entry with FLAG_BACKUP, otherwise, clear FLAG_BACKUP. Fixes: 892f396c8e68 ("mptcp: netlink: issue MP_PRIO signals from userspace PMs") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-1-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 56dfea9862b7..3f888bfe1462 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -560,6 +560,7 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct net *net = sock_net(skb->sk); + struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; @@ -601,6 +602,17 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) { + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + } + } + spin_unlock_bh(&msk->pm.lock); + lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); release_sock(sk); -- cgit v1.3 From f642c5c4d528d11bd78b6c6f84f541cd3c0bea86 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 12 Nov 2024 20:18:34 +0100 Subject: mptcp: hold pm lock when deleting entry When traversing userspace_pm_local_addr_list and deleting an entry from it in mptcp_pm_nl_remove_doit(), msk->pm.lock should be held. This patch holds this lock before mptcp_userspace_pm_lookup_addr_by_id() and releases it after list_move() in mptcp_pm_nl_remove_doit(). Fixes: d9a4594edabf ("mptcp: netlink: Add MPTCP_PM_CMD_REMOVE") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-2-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 3f888bfe1462..e35178f5205f 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -308,14 +308,17 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) lock_sock(sk); + spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); + spin_unlock_bh(&msk->pm.lock); release_sock(sk); goto out; } list_move(&match->list, &free_list); + spin_unlock_bh(&msk->pm.lock); mptcp_pm_remove_addrs(msk, &free_list); -- cgit v1.3 From db3eab8110bc0520416101b6a5b52f44a43fb4cf Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 12 Nov 2024 20:18:35 +0100 Subject: mptcp: pm: use _rcu variant under rcu_read_lock In mptcp_pm_create_subflow_or_signal_addr(), rcu_read_(un)lock() are used as expected to iterate over the list of local addresses, but list_for_each_entry() was used instead of list_for_each_entry_rcu() in __lookup_addr(). It is important to use this variant which adds the required READ_ONCE() (and diagnostic checks if enabled). Because __lookup_addr() is also used in mptcp_pm_nl_set_flags() where it is called under the pernet->lock and not rcu_read_lock(), an extra condition is then passed to help the diagnostic checks making sure either the associated spin lock or the RCU lock is held. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-3-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index db586a5b3866..45a2b5f05d38 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -524,7 +524,8 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } -- cgit v1.3 From e28acc9c1ccfcb24c08e020828f69d0a915b06ae Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 8 Nov 2024 06:08:36 -0800 Subject: ipmr: Fix access to mfc_cache_list without lock held Accessing `mr_table->mfc_cache_list` is protected by an RCU lock. In the following code flow, the RCU read lock is not held, causing the following error when `RCU_PROVE` is not held. The same problem might show up in the IPv6 code path. 6.12.0-rc5-kbuilder-01145-gbac17284bdcb #33 Tainted: G E N ----------------------------- net/ipv4/ipmr_base.c:313 RCU-list traversed in non-reader section!! rcu_scheduler_active = 2, debug_locks = 1 2 locks held by RetransmitAggre/3519: #0: ffff88816188c6c0 (nlk_cb_mutex-ROUTE){+.+.}-{3:3}, at: __netlink_dump_start+0x8a/0x290 #1: ffffffff83fcf7a8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_dumpit+0x6b/0x90 stack backtrace: lockdep_rcu_suspicious mr_table_dump ipmr_rtm_dumproute rtnl_dump_all rtnl_dumpit netlink_dump __netlink_dump_start rtnetlink_rcv_msg netlink_rcv_skb netlink_unicast netlink_sendmsg This is not a problem per see, since the RTNL lock is held here, so, it is safe to iterate in the list without the RCU read lock, as suggested by Eric. To alleviate the concern, modify the code to use list_for_each_entry_rcu() with the RTNL-held argument. The annotation will raise an error only if RTNL or RCU read lock are missing during iteration, signaling a legitimate problem, otherwise it will avoid this false positive. This will solve the IPv6 case as well, since ip6mr_rtm_dumproute() calls this function as well. Signed-off-by: Breno Leitao Reviewed-by: David Ahern Link: https://patch.msgid.link/20241108-ipmr_rcu-v2-1-c718998e209b@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr_base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 271dc03fc6db..f0af12a2f70b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -310,7 +310,8 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, if (filter->filter_set) flags |= NLM_F_DUMP_FILTERED; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list, + lockdep_rtnl_is_held()) { if (e < s_e) goto next_entry; if (filter->dev && -- cgit v1.3 From 9d287e70c51f1c141ac588add261ed2efdd6fc6b Mon Sep 17 00:00:00 2001 From: "Everest K.C" Date: Tue, 12 Nov 2024 16:36:06 -0700 Subject: xfrm: Add error handling when nla_put_u32() returns an error Error handling is missing when call to nla_put_u32() fails. Handle the error when the call to nla_put_u32() returns an error. The error was reported by Coverity Scan. Report: CID 1601525: (#1 of 1): Unused value (UNUSED_VALUE) returned_value: Assigning value from nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num) to err here, but that stored value is overwritten before it can be used Fixes: 1ddf9916ac09 ("xfrm: Add support for per cpu xfrm state handling.") Signed-off-by: Everest K.C. Reviewed-by: Simon Horman Reviewed-by: Przemek Kitszel Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b6ce2b3c6b87..fab18b85af53 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2609,8 +2609,11 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct err = xfrm_if_id_put(skb, x->if_id); if (err) goto out_cancel; - if (x->pcpu_num != UINT_MAX) + if (x->pcpu_num != UINT_MAX) { err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); + if (err) + goto out_cancel; + } if (x->dir) { err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); -- cgit v1.3 From 9e1a6db68e3ccc5c20fd2d6243285d1cc7215fe4 Mon Sep 17 00:00:00 2001 From: Daniel Yang Date: Wed, 13 Nov 2024 01:20:58 -0800 Subject: xfrm: replace deprecated strncpy with strscpy_pad The function strncpy is deprecated since it does not guarantee the destination buffer is NULL terminated. Recommended replacement is strscpy. The padded version was used to remain consistent with the other strscpy_pad usage in the modified function. Signed-off-by: Daniel Yang Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index fab18b85af53..6b0800c7c75e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1101,7 +1101,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) if (!nla) return -EMSGSIZE; algo = nla_data(nla); - strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); + strscpy_pad(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); if (redact_secret && auth->alg_key_len) memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8); -- cgit v1.3 From 3f5495962824fbef3b9a577ccd9b02f967452c11 Mon Sep 17 00:00:00 2001 From: Donald Hunter Date: Fri, 1 Nov 2024 14:32:07 +0000 Subject: netfilter: nfnetlink: Report extack policy errors for batched ops The nftables batch processing does not currently populate extack with policy errors. Fix this by passing extack when parsing batch messages. Signed-off-by: Donald Hunter Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 7784ec094097..e598a2a252b0 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -517,7 +517,7 @@ replay_abort: err = nla_parse_deprecated(cda, ss->cb[cb_id].attr_count, attr, attrlen, - ss->cb[cb_id].policy, NULL); + ss->cb[cb_id].policy, &extack); if (err < 0) goto ack; -- cgit v1.3 From 8340b0056ac723d04918573761b5d8f979d15a75 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 11 Nov 2024 14:47:51 +0000 Subject: netfilter: bpf: Pass string literal as format argument of request_module() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both gcc-14 and clang-18 report that passing a non-string literal as the format argument of request_module() is potentially insecure. E.g. clang-18 says: .../nf_bpf_link.c:46:24: warning: format string is not a string literal (potentially insecure) [-Wformat-security] 46 | err = request_module(mod); | ^~~ .../kmod.h:25:55: note: expanded from macro 'request_module' 25 | #define request_module(mod...) __request_module(true, mod) | ^~~ .../nf_bpf_link.c:46:24: note: treat the string as an argument to avoid this 46 | err = request_module(mod); | ^ | "%s", .../kmod.h:25:55: note: expanded from macro 'request_module' 25 | #define request_module(mod...) __request_module(true, mod) | ^ It is always the case where the contents of mod is safe to pass as the format argument. That is, in my understanding, it never contains any format escape sequences. But, it seems better to be safe than sorry. And, as a bonus, compiler output becomes less verbose by addressing this issue as suggested by clang-18. Compile tested only. Signed-off-by: Simon Horman Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_bpf_link.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 3d64a4511fcf..06b084844700 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -43,7 +43,7 @@ get_proto_defrag_hook(struct bpf_nf_link *link, hook = rcu_dereference(*ptr_global_hook); if (!hook) { rcu_read_unlock(); - err = request_module(mod); + err = request_module("%s", mod); if (err) return ERR_PTR(err < 0 ? err : -EINVAL); -- cgit v1.3 From 4ee29181216d2acb7be210126324ec3bc0e3bd01 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:49 +0100 Subject: netfilter: nf_tables: add nft_trans_commit_list_add_elem helper Add and use a wrapper to append trans_elem structures to the transaction log. Unlike the existing helper, pass a gfp_t to indicate if sleeping is allowed. This will be used by a followup patch to realloc nft_trans_elem structures after they gain a flexible array member to reduce number of such container structures on the transaction list. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6090ba9f1bb2..75c84b17ab99 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -421,6 +421,17 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr } } +static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, + gfp_t gfp) +{ + WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && + trans->msg_type != NFT_MSG_DELSETELEM); + + might_alloc(gfp); + + nft_trans_commit_list_add_tail(net, trans); +} + static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; @@ -7205,7 +7216,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (update_flags) { nft_trans_elem_priv(trans) = elem_priv; nft_trans_elem_update_flags(trans) = update_flags; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } } @@ -7229,7 +7240,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_elem_priv(trans) = elem.priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; err_set_full: @@ -7446,7 +7457,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); nft_trans_elem_priv(trans) = elem.priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; fail_ops: @@ -7482,7 +7493,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; nft_trans_elem_priv(trans) = elem_priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); return 0; } @@ -7499,7 +7510,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_priv(trans) = elem_priv; - nft_trans_commit_list_add_tail(ctx->net, trans); + nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; } -- cgit v1.3 From a8ee6b900c147d3bedced6c52ba6cb603226aaa3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:50 +0100 Subject: netfilter: nf_tables: prepare for multiple elements in nft_trans_elem structure Add helpers to release the individual elements contained in the trans_elem container structure. No functional change intended. Followup patch will add 'nelems' member and will turn 'priv' into a flexible array. These helpers can then loop over all elements. Care needs to be taken to handle a mix of new elements and existing elements that are being updated (e.g. timeout refresh). Before this patch, NEWSETELEM transaction with update is released early so nft_trans_set_elem_destroy() won't get called, so we need to skip elements marked as update. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 21 ++-- net/netfilter/nf_tables_api.c | 228 +++++++++++++++++++++++++++----------- 2 files changed, 173 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f24278767bfd..37af0b174c39 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1759,28 +1759,25 @@ enum nft_trans_elem_flags { NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; -struct nft_trans_elem { - struct nft_trans nft_trans; - struct nft_set *set; - struct nft_elem_priv *elem_priv; +struct nft_trans_one_elem { + struct nft_elem_priv *priv; u64 timeout; u64 expiration; u8 update_flags; +}; + +struct nft_trans_elem { + struct nft_trans nft_trans; + struct nft_set *set; bool bound; + unsigned int nelems; + struct nft_trans_one_elem elems[] __counted_by(nelems); }; #define nft_trans_container_elem(t) \ container_of(t, struct nft_trans_elem, nft_trans) #define nft_trans_elem_set(trans) \ nft_trans_container_elem(trans)->set -#define nft_trans_elem_priv(trans) \ - nft_trans_container_elem(trans)->elem_priv -#define nft_trans_elem_update_flags(trans) \ - nft_trans_container_elem(trans)->update_flags -#define nft_trans_elem_timeout(trans) \ - nft_trans_container_elem(trans)->timeout -#define nft_trans_elem_expiration(trans) \ - nft_trans_container_elem(trans)->expiration #define nft_trans_elem_set_bound(trans) \ nft_trans_container_elem(trans)->bound diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 75c84b17ab99..0882f78c2204 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6446,13 +6446,17 @@ static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { + struct nft_trans_elem *te; struct nft_trans *trans; - trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_elem)); + trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1)); if (trans == NULL) return NULL; - nft_trans_elem_set(trans) = set; + te = nft_trans_container_elem(trans); + te->nelems = 1; + te->set = set; + return trans; } @@ -6574,28 +6578,51 @@ static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, } /* Drop references and destroy. Called from gc, dynset and abort path. */ -void nft_set_elem_destroy(const struct nft_set *set, - const struct nft_elem_priv *elem_priv, - bool destroy_expr) +static void __nft_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); - struct nft_ctx ctx = { - .net = read_pnet(&set->net), - .family = set->table->family, - }; nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_release(nft_set_ext_data(ext), set->dtype); if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) - nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext)); + nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext)); if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF)) nft_use_dec(&(*nft_set_ext_obj(ext))->use); kfree(elem_priv); } + +/* Drop references and destroy. Called from gc and dynset. */ +void nft_set_elem_destroy(const struct nft_set *set, + const struct nft_elem_priv *elem_priv, + bool destroy_expr) +{ + struct nft_ctx ctx = { + .net = read_pnet(&set->net), + .family = set->table->family, + }; + + __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr); +} EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +/* Drop references and destroy. Called from abort path. */ +static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + if (te->elems[i].update_flags) + continue; + + __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); + } +} + /* Destroy element. References have been already dropped in the preparation * path via nft_setelem_data_deactivate(). */ @@ -6611,6 +6638,15 @@ void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, kfree(elem_priv); } +static void nft_trans_elems_destroy(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) + nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv); +} + int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]) { @@ -6767,6 +6803,36 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, } } +static void nft_trans_elem_update(const struct nft_set *set, + const struct nft_trans_one_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (elem->update_flags & NFT_TRANS_UPD_TIMEOUT) + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, elem->timeout); + + if (elem->update_flags & NFT_TRANS_UPD_EXPIRATION) + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + elem->expiration); +} + +static void nft_trans_elems_add(const struct nft_ctx *ctx, + struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + const struct nft_trans_one_elem *elem = &te->elems[i]; + + if (elem->update_flags) + nft_trans_elem_update(te->set, elem); + else + nft_setelem_activate(ctx->net, te->set, elem->priv); + + nf_tables_setelem_notify(ctx, te->set, elem->priv, + NFT_MSG_NEWSETELEM); + } +} + static int nft_setelem_catchall_deactivate(const struct net *net, struct nft_set *set, struct nft_set_elem *elem) @@ -6849,6 +6915,24 @@ static void nft_setelem_remove(const struct net *net, set->ops->remove(net, set, elem_priv); } +static void nft_trans_elems_remove(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + nf_tables_setelem_notify(ctx, te->set, + te->elems[i].priv, + te->nft_trans.msg_type); + + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) { + atomic_dec(&te->set->nelems); + te->set->ndeact--; + } + } +} + static bool nft_setelem_valid_key_end(const struct nft_set *set, struct nlattr **nla, u32 flags) { @@ -7200,22 +7284,26 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { + struct nft_trans_one_elem *update; + + update = &nft_trans_container_elem(trans)->elems[0]; + update_flags = 0; if (timeout != nft_set_ext_timeout(ext2)->timeout) { - nft_trans_elem_timeout(trans) = timeout; + update->timeout = timeout; if (expiration == 0) expiration = timeout; update_flags |= NFT_TRANS_UPD_TIMEOUT; } if (expiration) { - nft_trans_elem_expiration(trans) = expiration; + update->expiration = expiration; update_flags |= NFT_TRANS_UPD_EXPIRATION; } if (update_flags) { - nft_trans_elem_priv(trans) = elem_priv; - nft_trans_elem_update_flags(trans) = update_flags; + update->priv = elem_priv; + update->update_flags = update_flags; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } @@ -7239,7 +7327,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } } - nft_trans_elem_priv(trans) = elem.priv; + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; @@ -7377,6 +7465,50 @@ void nft_setelem_data_deactivate(const struct net *net, nft_use_dec(&(*nft_set_ext_obj(ext))->use); } +/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem. + * No notifications and no ndeact changes. + * + * Returns true if set had been added to (i.e., elements need to be removed again). + */ +static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + bool removed = false; + int i; + + for (i = 0; i < te->nelems; i++) { + if (te->elems[i].update_flags) + continue; + + if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + atomic_dec(&te->set->nelems); + + removed = true; + } + + return removed; +} + +/* Called from abort path to undo DELSETELEM/DESTROYSETELEM. */ +static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, + const struct nft_trans_elem *te) +{ + int i; + + for (i = 0; i < te->nelems; i++) { + if (!nft_setelem_active_next(ctx->net, te->set, te->elems[i].priv)) { + nft_setelem_data_activate(ctx->net, te->set, te->elems[i].priv); + nft_setelem_activate(ctx->net, te->set, te->elems[i].priv); + } + + if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) + te->set->ndeact--; + } +} + static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -7456,7 +7588,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); - nft_trans_elem_priv(trans) = elem.priv; + nft_trans_container_elem(trans)->elems[0].priv = elem.priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; @@ -7483,7 +7615,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, return 0; trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_ATOMIC); + struct_size_t(struct nft_trans_elem, elems, 1), + GFP_ATOMIC); if (!trans) return -ENOMEM; @@ -7492,7 +7625,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_elem_set(trans) = set; - nft_trans_elem_priv(trans) = elem_priv; + nft_trans_container_elem(trans)->nelems = 1; + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); return 0; @@ -7509,7 +7643,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, return -ENOMEM; nft_setelem_data_deactivate(ctx->net, set, elem_priv); - nft_trans_elem_priv(trans) = elem_priv; + nft_trans_container_elem(trans)->elems[0].priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); return 0; @@ -9691,9 +9825,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELSETELEM: case NFT_MSG_DESTROYSETELEM: - nf_tables_set_elem_destroy(&ctx, - nft_trans_elem_set(trans), - nft_trans_elem_priv(trans)); + nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_DELOBJ: case NFT_MSG_DESTROYOBJ: @@ -10546,25 +10678,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = nft_trans_container_elem(trans); - if (te->update_flags) { - const struct nft_set_ext *ext = - nft_set_elem_ext(te->set, te->elem_priv); + nft_trans_elems_add(&ctx, te); - if (te->update_flags & NFT_TRANS_UPD_TIMEOUT) { - WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, - te->timeout); - } - if (te->update_flags & NFT_TRANS_UPD_EXPIRATION) { - WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, - get_jiffies_64() + te->expiration); - } - } else { - nft_setelem_activate(net, te->set, te->elem_priv); - } - - nf_tables_setelem_notify(&ctx, te->set, - te->elem_priv, - NFT_MSG_NEWSETELEM); if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, @@ -10576,14 +10691,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_DESTROYSETELEM: te = nft_trans_container_elem(trans); - nf_tables_setelem_notify(&ctx, te->set, - te->elem_priv, - trans->msg_type); - nft_setelem_remove(net, te->set, te->elem_priv); - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) { - atomic_dec(&te->set->nelems); - te->set->ndeact--; - } + nft_trans_elems_remove(&ctx, te); + if (te->set->ops->commit && list_empty(&te->set->pending_update)) { list_add_tail(&te->set->pending_update, @@ -10703,8 +10812,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_set_destroy(&ctx, nft_trans_set(trans)); break; case NFT_MSG_NEWSETELEM: - nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem_priv(trans), true); + nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans)); break; case NFT_MSG_NEWOBJ: nft_obj_destroy(&ctx, nft_trans_obj(trans)); @@ -10861,18 +10969,15 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - if (nft_trans_elem_update_flags(trans) || - nft_trans_elem_set_bound(trans)) { + if (nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } te = nft_trans_container_elem(trans); - if (!te->set->ops->abort || - nft_setelem_is_catchall(te->set, te->elem_priv)) - nft_setelem_remove(net, te->set, te->elem_priv); - - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) - atomic_dec(&te->set->nelems); + if (!nft_trans_elems_new_abort(&ctx, te)) { + nft_trans_destroy(trans); + break; + } if (te->set->ops->abort && list_empty(&te->set->pending_update)) { @@ -10884,12 +10989,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DESTROYSETELEM: te = nft_trans_container_elem(trans); - if (!nft_setelem_active_next(net, te->set, te->elem_priv)) { - nft_setelem_data_activate(net, te->set, te->elem_priv); - nft_setelem_activate(net, te->set, te->elem_priv); - } - if (!nft_setelem_is_catchall(te->set, te->elem_priv)) - te->set->ndeact--; + nft_trans_elems_destroy_abort(&ctx, te); if (te->set->ops->abort && list_empty(&te->set->pending_update)) { -- cgit v1.3 From 466c9b3b2a92602360e9fa25943b8aa191122dfc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:51 +0100 Subject: netfilter: nf_tables: prepare nft audit for set element compaction nftables audit log format emits the number of added/deleted rules, sets, set elements and so on, to userspace: table=t1 family=2 entries=4 op=nft_register_set ~~~~~~~~~ At this time, the 'entries' key is the number of transactions that will be applied. The upcoming set element compression will coalesce subsequent adds/deletes to the same set requests in the same transaction request to conseve memory. Without this patch, we'd under-report the number of altered elements. Increment the audit counter by the number of elements to keep the reported entries value the same. Without this, nft_audit.sh selftest fails because the recorded (expected) entries key is smaller than the expected one. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0882f78c2204..5b5178841553 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10398,9 +10398,24 @@ static void nf_tables_commit_audit_free(struct list_head *adl) } } +/* nft audit emits the number of elements that get added/removed/updated, + * so NEW/DELSETELEM needs to increment based on the total elem count. + */ +static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans) +{ + switch (trans->msg_type) { + case NFT_MSG_NEWSETELEM: + case NFT_MSG_DELSETELEM: + return nft_trans_container_elem(trans)->nelems; + } + + return 1; +} + static void nf_tables_commit_audit_collect(struct list_head *adl, - struct nft_table *table, u32 op) + const struct nft_trans *trans, u32 op) { + const struct nft_table *table = trans->table; struct nft_audit_data *adp; list_for_each_entry(adp, adl, list) { @@ -10410,7 +10425,7 @@ static void nf_tables_commit_audit_collect(struct list_head *adl, WARN_ONCE(1, "table=%s not expected in commit list", table->name); return; found: - adp->entries++; + adp->entries += nf_tables_commit_audit_entrycount(trans); if (!adp->op || adp->op > op) adp->op = op; } @@ -10569,7 +10584,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_ctx_update(&ctx, trans); - nf_tables_commit_audit_collect(&adl, table, trans->msg_type); + nf_tables_commit_audit_collect(&adl, trans, trans->msg_type); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { -- cgit v1.3 From b0c49466043a4878d8ef1263a4c9020698958a4c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:52 +0100 Subject: netfilter: nf_tables: switch trans_elem to real flex array When queueing a set element add or removal operation to the transaction log, check if the previous operation already asks for a the identical operation on the same set. If so, store the element reference in the preceding operation. This significantlty reduces memory consumption when many set add/delete operations appear in a single transaction. Example: 10k elements require 937kb of memory (10k allocations from kmalloc-96 slab). Assuming we can compact 4 elements in the same set, 468 kbytes are needed (64 bytes for base struct, nft_trans_elemn, 32 bytes for nft_trans_one_elem structure, so 2500 allocations from kmalloc-192 slab). For large batch updates we can compact up to 62 elements into one single nft_trans_elem structure (~65% mem reduction): (64 bytes for base struct, nft_trans_elem, 32 byte for nft_trans_one_elem struct). We can halve size of nft_trans_one_elem struct by moving timeout/expire/update_flags into a dynamically allocated structure, this allows to store 124 elements in a 2k slab nft_trans_elem struct. This is done in a followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 90 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5b5178841553..679312d71bbe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -26,6 +26,9 @@ #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) #define NFT_SET_MAX_ANONLEN 16 +/* limit compaction to avoid huge kmalloc/krealloc sizes. */ +#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem)) + unsigned int nf_tables_net_id __read_mostly; static LIST_HEAD(nf_tables_expressions); @@ -391,6 +394,86 @@ static void nf_tables_unregister_hook(struct net *net, return __nf_tables_unregister_hook(net, table, chain, false); } +static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b) +{ + /* NB: the ->bound equality check is defensive, at this time we only merge + * a new nft_trans_elem transaction request with the transaction tail + * element, but a->bound != b->bound would imply a NEWRULE transaction + * is queued in-between. + * + * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents + * huge krealloc() requests. + */ + return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS; +} + +static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, + struct nft_trans_elem *tail, + struct nft_trans_elem *trans, + gfp_t gfp) +{ + unsigned int nelems, old_nelems = tail->nelems; + struct nft_trans_elem *new_trans; + + if (!nft_trans_collapse_set_elem_allowed(tail, trans)) + return false; + + /* "cannot happen", at this time userspace element add + * requests always allocate a new transaction element. + * + * This serves as a reminder to adjust the list_add_tail + * logic below in case this ever changes. + */ + if (WARN_ON_ONCE(trans->nelems != 1)) + return false; + + if (check_add_overflow(old_nelems, trans->nelems, &nelems)) + return false; + + /* krealloc might free tail which invalidates list pointers */ + list_del_init(&tail->nft_trans.list); + + new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); + if (!new_trans) { + list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); + return false; + } + + /* + * new_trans->nft_trans.list contains garbage, but + * list_add_tail() doesn't care. + */ + new_trans->nelems = nelems; + new_trans->elems[old_nelems] = trans->elems[0]; + list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list); + + return true; +} + +static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, + struct nft_trans *trans, gfp_t gfp) +{ + struct nft_trans *tail; + + if (list_empty(&nft_net->commit_list)) + return false; + + tail = list_last_entry(&nft_net->commit_list, struct nft_trans, list); + + if (tail->msg_type != trans->msg_type) + return false; + + switch (trans->msg_type) { + case NFT_MSG_NEWSETELEM: + case NFT_MSG_DELSETELEM: + return nft_trans_collapse_set_elem(nft_net, + nft_trans_container_elem(tail), + nft_trans_container_elem(trans), gfp); + } + + return false; +} + static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -424,11 +507,18 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, gfp_t gfp) { + struct nftables_pernet *nft_net = nft_pernet(net); + WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && trans->msg_type != NFT_MSG_DELSETELEM); might_alloc(gfp); + if (nft_trans_try_collapse(nft_net, trans, gfp)) { + kfree(trans); + return; + } + nft_trans_commit_list_add_tail(net, trans); } -- cgit v1.3 From 508180850b732c7a0e3a728460cf3f95f25e1fbd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 13 Nov 2024 16:35:53 +0100 Subject: netfilter: nf_tables: allocate element update information dynamically Move the timeout/expire/flag members from nft_trans_one_elem struct into a dybamically allocated structure, only needed when timeout update was requested. This halves size of nft_trans_one_elem struct and allows to compact up to 124 elements in one transaction container rather than 62. This halves memory requirements for a large flush or insert transaction, where ->update remains NULL. Care has to be taken to release the extra data in all spots, including abort path. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 10 ++++--- net/netfilter/nf_tables_api.c | 57 ++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 37af0b174c39..80a537ac26cd 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1759,11 +1759,15 @@ enum nft_trans_elem_flags { NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; -struct nft_trans_one_elem { - struct nft_elem_priv *priv; +struct nft_elem_update { u64 timeout; u64 expiration; - u8 update_flags; + u8 flags; +}; + +struct nft_trans_one_elem { + struct nft_elem_priv *priv; + struct nft_elem_update *update; }; struct nft_trans_elem { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 679312d71bbe..21b6f7410a1f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6706,7 +6706,8 @@ static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_tra int i; for (i = 0; i < te->nelems; i++) { - if (te->elems[i].update_flags) + /* skip update request, see nft_trans_elems_new_abort() */ + if (!te->elems[i].priv) continue; __nft_set_elem_destroy(ctx, te->set, te->elems[i].priv, true); @@ -6897,12 +6898,13 @@ static void nft_trans_elem_update(const struct nft_set *set, const struct nft_trans_one_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_elem_update *update = elem->update; - if (elem->update_flags & NFT_TRANS_UPD_TIMEOUT) - WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, elem->timeout); + if (update->flags & NFT_TRANS_UPD_TIMEOUT) + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout); - if (elem->update_flags & NFT_TRANS_UPD_EXPIRATION) - WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + elem->expiration); + if (update->flags & NFT_TRANS_UPD_EXPIRATION) + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration); } static void nft_trans_elems_add(const struct nft_ctx *ctx, @@ -6911,15 +6913,16 @@ static void nft_trans_elems_add(const struct nft_ctx *ctx, int i; for (i = 0; i < te->nelems; i++) { - const struct nft_trans_one_elem *elem = &te->elems[i]; + struct nft_trans_one_elem *elem = &te->elems[i]; - if (elem->update_flags) + if (elem->update) nft_trans_elem_update(te->set, elem); else nft_setelem_activate(ctx->net, te->set, elem->priv); nf_tables_setelem_notify(ctx, te->set, elem->priv, NFT_MSG_NEWSETELEM); + kfree(elem->update); } } @@ -7011,6 +7014,8 @@ static void nft_trans_elems_remove(const struct nft_ctx *ctx, int i; for (i = 0; i < te->nelems; i++) { + WARN_ON_ONCE(te->elems[i].update); + nf_tables_setelem_notify(ctx, te->set, te->elems[i].priv, te->nft_trans.msg_type); @@ -7059,7 +7064,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; - u8 update_flags; u64 expiration; u64 timeout; int err, i; @@ -7374,26 +7378,32 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { - struct nft_trans_one_elem *update; - - update = &nft_trans_container_elem(trans)->elems[0]; + struct nft_elem_update update = { }; - update_flags = 0; if (timeout != nft_set_ext_timeout(ext2)->timeout) { - update->timeout = timeout; + update.timeout = timeout; if (expiration == 0) expiration = timeout; - update_flags |= NFT_TRANS_UPD_TIMEOUT; + update.flags |= NFT_TRANS_UPD_TIMEOUT; } if (expiration) { - update->expiration = expiration; - update_flags |= NFT_TRANS_UPD_EXPIRATION; + update.expiration = expiration; + update.flags |= NFT_TRANS_UPD_EXPIRATION; } - if (update_flags) { - update->priv = elem_priv; - update->update_flags = update_flags; + if (update.flags) { + struct nft_trans_one_elem *ue; + + ue = &nft_trans_container_elem(trans)->elems[0]; + + ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL); + if (!ue->update) { + err = -ENOMEM; + goto err_element_clash; + } + + ue->priv = elem_priv; nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); goto err_elem_free; } @@ -7561,14 +7571,19 @@ void nft_setelem_data_deactivate(const struct net *net, * Returns true if set had been added to (i.e., elements need to be removed again). */ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, - const struct nft_trans_elem *te) + struct nft_trans_elem *te) { bool removed = false; int i; for (i = 0; i < te->nelems; i++) { - if (te->elems[i].update_flags) + if (te->elems[i].update) { + kfree(te->elems[i].update); + te->elems[i].update = NULL; + /* Update request, so do not release this element */ + te->elems[i].priv = NULL; continue; + } if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); -- cgit v1.3 From 35f56c554eb1b56b77b3cf197a6b00922d49033d Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Wed, 13 Nov 2024 22:02:09 +0900 Subject: netfilter: ipset: add missing range check in bitmap_ip_uadt When tb[IPSET_ATTR_IP_TO] is not present but tb[IPSET_ATTR_CIDR] exists, the values of ip and ip_to are slightly swapped. Therefore, the range check for ip should be done later, but this part is missing and it seems that the vulnerability occurs. So we should add missing range checks and remove unnecessary range checks. Cc: Reported-by: syzbot+58c872f7790a4d2ac951@syzkaller.appspotmail.com Fixes: 72205fc68bd1 ("netfilter: ipset: bitmap:ip set type support") Signed-off-by: Jeongjun Park Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_bitmap_ip.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index e4fa00abde6a..5988b9bb9029 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -163,11 +163,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) { + if (ip > ip_to) swap(ip, ip_to); - if (ip < map->first_ip) - return -IPSET_ERR_BITMAP_RANGE; - } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -178,7 +175,7 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[], ip_to = ip; } - if (ip_to > map->last_ip) + if (ip < map->first_ip || ip_to > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; for (; !before(ip_to, ip); ip += map->hosts) { -- cgit v1.3 From d96b543c6f3b78b6440b68b5a5bbface553eff28 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 1 Oct 2024 09:21:25 +0200 Subject: Bluetooth: hci_conn: Reduce hci_conn_drop() calls in two functions An hci_conn_drop() call was immediately used after a null pointer check for an hci_conn_link() call in two function implementations. Thus call such a function only once instead directly before the checks. This issue was transformed by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c4c74b82ed21..50e65b2f54ee 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2224,13 +2224,9 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, conn->iso_qos.bcast.big); if (parent && parent != conn) { link = hci_conn_link(parent, conn); - if (!link) { - hci_conn_drop(conn); - return ERR_PTR(-ENOLINK); - } - - /* Link takes the refcount */ hci_conn_drop(conn); + if (!link) + return ERR_PTR(-ENOLINK); } return conn; @@ -2320,15 +2316,12 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, } link = hci_conn_link(le, cis); + hci_conn_drop(cis); if (!link) { hci_conn_drop(le); - hci_conn_drop(cis); return ERR_PTR(-ENOLINK); } - /* Link takes the refcount */ - hci_conn_drop(cis); - cis->state = BT_CONNECT; hci_le_create_cis_pending(hdev); -- cgit v1.3 From 2b0f2fc9ed62e73c95df1fa8ed2ba3dac54699df Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 1 Oct 2024 12:34:06 -0400 Subject: Bluetooth: hci_conn: Use disable_delayed_work_sync This makes use of disable_delayed_work_sync instead cancel_delayed_work_sync as it not only cancel the ongoing work but also disables new submit which is disarable since the object holding the work is about to be freed. Reported-by: syzbot+2446dd3cb07277388db6@syzkaller.appspotmail.com Tested-by: syzbot+2446dd3cb07277388db6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2446dd3cb07277388db6 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 50e65b2f54ee..40c4a36d2be3 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1127,9 +1127,9 @@ void hci_conn_del(struct hci_conn *conn) hci_conn_unlink(conn); - cancel_delayed_work_sync(&conn->disc_work); - cancel_delayed_work_sync(&conn->auto_accept_work); - cancel_delayed_work_sync(&conn->idle_work); + disable_delayed_work_sync(&conn->disc_work); + disable_delayed_work_sync(&conn->auto_accept_work); + disable_delayed_work_sync(&conn->idle_work); if (conn->type == ACL_LINK) { /* Unacked frames */ -- cgit v1.3 From 3fe288a8214e7dd784d1f9b7c9e448244d316b47 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 8 Oct 2024 10:16:48 -0400 Subject: Bluetooth: hci_core: Fix not checking skb length on hci_acldata_packet This fixes not checking if skb really contains an ACL header otherwise the code may attempt to access some uninitilized/invalid memory past the valid skb->data. Reported-by: syzbot+6ea290ba76d8c1eb1ac2@syzkaller.appspotmail.com Tested-by: syzbot+6ea290ba76d8c1eb1ac2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6ea290ba76d8c1eb1ac2 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 96d097b21d13..f9e1df409015 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3771,18 +3771,22 @@ static void hci_tx_work(struct work_struct *work) /* ACL data packet */ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { - struct hci_acl_hdr *hdr = (void *) skb->data; + struct hci_acl_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; - skb_pull(skb, HCI_ACL_HDR_SIZE); + hdr = skb_pull_data(skb, sizeof(*hdr)); + if (!hdr) { + bt_dev_err(hdev, "ACL packet too small"); + goto drop; + } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); - BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len, - handle, flags); + bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, + handle, flags); hdev->stat.acl_rx++; @@ -3803,6 +3807,7 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) handle); } +drop: kfree_skb(skb); } -- cgit v1.3 From 59437cbb5781227567dec226aaa88c66a09ccc5a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 8 Oct 2024 11:33:15 -0400 Subject: Bluetooth: hci_core: Fix not checking skb length on hci_scodata_packet This fixes not checking if skb really contains an SCO header otherwise the code may attempt to access some uninitilized/invalid memory past the valid skb->data. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f9e1df409015..f6cff34a8542 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3814,18 +3814,22 @@ drop: /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { - struct hci_sco_hdr *hdr = (void *) skb->data; + struct hci_sco_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; - skb_pull(skb, HCI_SCO_HDR_SIZE); + hdr = skb_pull_data(skb, sizeof(*hdr)); + if (!hdr) { + bt_dev_err(hdev, "SCO packet too small"); + goto drop; + } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); - BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len, - handle, flags); + bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, + handle, flags); hdev->stat.sco_rx++; @@ -3843,6 +3847,7 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) handle); } +drop: kfree_skb(skb); } -- cgit v1.3 From 5fe6caa62b07fd39cd6a28acc8f92ba2955e11a6 Mon Sep 17 00:00:00 2001 From: Andrej Shadura Date: Wed, 9 Oct 2024 14:14:24 +0200 Subject: Bluetooth: Fix type of len in rfcomm_sock_getsockopt{,_old}() Commit 9bf4e919ccad worked around an issue introduced after an innocuous optimisation change in LLVM main: > len is defined as an 'int' because it is assigned from > '__user int *optlen'. However, it is clamped against the result of > sizeof(), which has a type of 'size_t' ('unsigned long' for 64-bit > platforms). This is done with min_t() because min() requires compatible > types, which results in both len and the result of sizeof() being casted > to 'unsigned int', meaning len changes signs and the result of sizeof() > is truncated. From there, len is passed to copy_to_user(), which has a > third parameter type of 'unsigned long', so it is widened and changes > signs again. This excessive casting in combination with the KCSAN > instrumentation causes LLVM to fail to eliminate the __bad_copy_from() > call, failing the build. The same issue occurs in rfcomm in functions rfcomm_sock_getsockopt and rfcomm_sock_getsockopt_old. Change the type of len to size_t in both rfcomm_sock_getsockopt and rfcomm_sock_getsockopt_old and replace min_t() with min(). Cc: stable@vger.kernel.org Co-authored-by: Aleksei Vetrov Improves: 9bf4e919ccad ("Bluetooth: Fix type of len in {l2cap,sco}_sock_getsockopt_old()") Link: https://github.com/ClangBuiltLinux/linux/issues/2007 Link: https://github.com/llvm/llvm-project/issues/85647 Signed-off-by: Andrej Shadura Reviewed-by: Nathan Chancellor Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/sock.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 355e1a1698f5..40766f8119ed 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -729,7 +729,8 @@ static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __u struct sock *l2cap_sk; struct l2cap_conn *conn; struct rfcomm_conninfo cinfo; - int len, err = 0; + int err = 0; + size_t len; u32 opt; BT_DBG("sk %p", sk); @@ -783,7 +784,7 @@ static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __u cinfo.hci_handle = conn->hcon->handle; memcpy(cinfo.dev_class, conn->hcon->dev_class, 3); - len = min_t(unsigned int, len, sizeof(cinfo)); + len = min(len, sizeof(cinfo)); if (copy_to_user(optval, (char *) &cinfo, len)) err = -EFAULT; @@ -802,7 +803,8 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c { struct sock *sk = sock->sk; struct bt_security sec; - int len, err = 0; + int err = 0; + size_t len; BT_DBG("sk %p", sk); @@ -827,7 +829,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c sec.level = rfcomm_pi(sk)->sec_level; sec.key_size = 0; - len = min_t(unsigned int, len, sizeof(sec)); + len = min(len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) err = -EFAULT; -- cgit v1.3 From 5bd3135924b4570dcecc8793f7771cb8d42d8b19 Mon Sep 17 00:00:00 2001 From: Danil Pylaev Date: Mon, 21 Oct 2024 12:22:45 +0000 Subject: Bluetooth: Support new quirks for ATS2851 This adds support for quirks for broken extended create connection, and write auth payload timeout. Signed-off-by: Danil Pylaev Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 7 +++++++ net/bluetooth/hci_sync.c | 9 ++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0bbad90ddd6f..65f5ed2ded70 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3626,6 +3626,13 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, goto unlock; } + /* We skip the WRITE_AUTH_PAYLOAD_TIMEOUT for ATS2851 based controllers + * to avoid unexpected SMP command errors when pairing. + */ + if (test_bit(HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, + &hdev->quirks)) + goto notify; + /* Set the default Authenticated Payload Timeout after * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c0203a2b5107..c86f4e42e69c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4842,6 +4842,13 @@ static const struct { HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT, "HCI LE Set Random Private Address Timeout command is " "advertised, but not supported."), + HCI_QUIRK_BROKEN(EXT_CREATE_CONN, + "HCI LE Extended Create Connection command is " + "advertised, but not supported."), + HCI_QUIRK_BROKEN(WRITE_AUTH_PAYLOAD_TIMEOUT, + "HCI WRITE AUTH PAYLOAD TIMEOUT command leads " + "to unexpected SMP errors when pairing " + "and will not be used."), HCI_QUIRK_BROKEN(LE_CODED, "HCI LE Coded PHY feature bit is set, " "but its usage is not supported.") @@ -6477,7 +6484,7 @@ static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) &own_addr_type); if (err) goto done; - + /* Send command LE Extended Create Connection if supported */ if (use_ext_conn(hdev)) { err = hci_le_ext_create_conn_sync(hdev, conn, own_addr_type); goto done; -- cgit v1.3 From 4a5e0ba68676b3a77298cf646cd2b39c94fbd2f5 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:36 +0200 Subject: Bluetooth: ISO: Do not emit LE PA Create Sync if previous is pending The Bluetooth Core spec does not allow a LE PA Create sync command to be sent to Controller if another one is pending (Vol 4, Part E, page 2493). In order to avoid this issue, the HCI_CONN_CREATE_PA_SYNC was added to mark that the LE PA Create Sync command has been sent for a hcon. Once the PA Sync Established event is received, the hcon flag is erased and the next pending hcon is handled. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 3 +- include/net/bluetooth/hci_core.h | 34 +++++++++++ net/bluetooth/hci_conn.c | 123 +++++++++++++++++++++++++++------------ net/bluetooth/hci_event.c | 19 +++++- 4 files changed, 139 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4f64066915be..4becf201b063 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1,7 +1,7 @@ /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated - Copyright 2023 NXP + Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky @@ -697,6 +697,7 @@ enum { #define HCI_RSSI_INVALID 127 #define HCI_SYNC_HANDLE_INVALID 0xffff +#define HCI_SID_INVALID 0xff #define HCI_ROLE_MASTER 0x00 #define HCI_ROLE_SLAVE 0x01 diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 94ddc8684973..43474b751a50 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -668,6 +668,7 @@ struct hci_conn { __u8 adv_instance; __u16 handle; __u16 sync_handle; + __u8 sid; __u16 state; __u16 mtu; __u8 mode; @@ -947,6 +948,7 @@ enum { HCI_CONN_CREATE_CIS, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, + HCI_CONN_CREATE_PA_SYNC, HCI_CONN_PA_SYNC, HCI_CONN_PA_SYNC_FAILED, }; @@ -1099,6 +1101,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, + __u8 sid, + bdaddr_t *dst, + __u8 dst_type) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK || bacmp(&c->dst, dst) || + c->dst_type != dst_type || c->sid != sid) + continue; + + rcu_read_unlock(); + return c; + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, bdaddr_t *ba, @@ -1328,6 +1354,13 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) if (c->type != ISO_LINK) continue; + /* Ignore the listen hcon, we are looking + * for the child hcon that was created as + * a result of the PA sync established event. + */ + if (c->state == BT_LISTEN) + continue; + if (c->sync_handle == sync_handle) { rcu_read_unlock(); return c; @@ -1445,6 +1478,7 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); +int hci_pa_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 40c4a36d2be3..f9da12339db8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -952,6 +952,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t conn->tx_power = HCI_TX_POWER_INVALID; conn->max_tx_power = HCI_TX_POWER_INVALID; conn->sync_handle = HCI_SYNC_HANDLE_INVALID; + conn->sid = HCI_SID_INVALID; set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; @@ -2062,73 +2063,119 @@ static int create_big_sync(struct hci_dev *hdev, void *data) static void create_pa_complete(struct hci_dev *hdev, void *data, int err) { - struct hci_cp_le_pa_create_sync *cp = data; - bt_dev_dbg(hdev, ""); if (err) bt_dev_err(hdev, "Unable to create PA: %d", err); +} + +static bool hci_conn_check_create_pa_sync(struct hci_conn *conn) +{ + if (conn->type != ISO_LINK || conn->sid == HCI_SID_INVALID) + return false; - kfree(cp); + return true; } static int create_pa_sync(struct hci_dev *hdev, void *data) { - struct hci_cp_le_pa_create_sync *cp = data; - int err; + struct hci_cp_le_pa_create_sync *cp = NULL; + struct hci_conn *conn; + int err = 0; - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, - sizeof(*cp), cp, HCI_CMD_TIMEOUT); - if (err) { - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - return err; + hci_dev_lock(hdev); + + rcu_read_lock(); + + /* The spec allows only one pending LE Periodic Advertising Create + * Sync command at a time. If the command is pending now, don't do + * anything. We check for pending connections after each PA Sync + * Established event. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2493: + * + * If the Host issues this command when another HCI_LE_Periodic_ + * Advertising_Create_Sync command is pending, the Controller shall + * return the error code Command Disallowed (0x0C). + */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (test_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags)) + goto unlock; } - return hci_update_passive_scan_sync(hdev); + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (hci_conn_check_create_pa_sync(conn)) { + struct bt_iso_qos *qos = &conn->iso_qos; + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + err = -ENOMEM; + goto unlock; + } + + cp->options = qos->bcast.options; + cp->sid = conn->sid; + cp->addr_type = conn->dst_type; + bacpy(&cp->addr, &conn->dst); + cp->skip = cpu_to_le16(qos->bcast.skip); + cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp->sync_cte_type = qos->bcast.sync_cte_type; + + break; + } + } + +unlock: + rcu_read_unlock(); + + hci_dev_unlock(hdev); + + if (cp) { + hci_dev_set_flag(hdev, HCI_PA_SYNC); + set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, + sizeof(*cp), cp, HCI_CMD_TIMEOUT); + if (!err) + err = hci_update_passive_scan_sync(hdev); + + kfree(cp); + + if (err) { + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + } + } + + return err; +} + +int hci_pa_create_sync_pending(struct hci_dev *hdev) +{ + /* Queue start pa_create_sync and scan */ + return hci_cmd_sync_queue(hdev, create_pa_sync, + NULL, create_pa_complete); } struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos) { - struct hci_cp_le_pa_create_sync *cp; struct hci_conn *conn; - int err; - - if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) - return ERR_PTR(-EBUSY); conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; conn->iso_qos = *qos; + conn->dst_type = dst_type; + conn->sid = sid; conn->state = BT_LISTEN; hci_conn_hold(conn); - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - hci_conn_drop(conn); - return ERR_PTR(-ENOMEM); - } - - cp->options = qos->bcast.options; - cp->sid = sid; - cp->addr_type = dst_type; - bacpy(&cp->addr, dst); - cp->skip = cpu_to_le16(qos->bcast.skip); - cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); - cp->sync_cte_type = qos->bcast.sync_cte_type; - - /* Queue start pa_create_sync and scan */ - err = hci_cmd_sync_queue(hdev, create_pa_sync, cp, create_pa_complete); - if (err < 0) { - hci_conn_drop(conn); - kfree(cp); - return ERR_PTR(err); - } + hci_pa_create_sync_pending(hdev); return conn; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 65f5ed2ded70..fd269fcabc2e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6352,7 +6352,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, struct hci_ev_le_pa_sync_established *ev = data; int mask = hdev->link_mode; __u8 flags = 0; - struct hci_conn *pa_sync; + struct hci_conn *pa_sync, *conn; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6360,6 +6360,20 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, hci_dev_clear_flag(hdev, HCI_PA_SYNC); + conn = hci_conn_hash_lookup_sid(hdev, ev->sid, &ev->bdaddr, + ev->bdaddr_type); + if (!conn) { + bt_dev_err(hdev, + "Unable to find connection for dst %pMR sid 0x%2.2x", + &ev->bdaddr, ev->sid); + goto unlock; + } + + clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + conn->sync_handle = le16_to_cpu(ev->handle); + conn->sid = HCI_SID_INVALID; + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); @@ -6386,6 +6400,9 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, } unlock: + /* Handle any other pending PA sync command */ + hci_pa_create_sync_pending(hdev); + hci_dev_unlock(hdev); } -- cgit v1.3 From 79321b06a03e395ab1fc19a47549e9d70ddac115 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:37 +0200 Subject: Bluetooth: ISO: Fix matching parent socket for BIS slave Currently, when a BIS slave connection is notified to the ISO layer, the parent socket is tried to be matched by the HCI_EVT_LE_BIG_SYNC_ESTABILISHED event. However, a BIS slave connection is notified to the ISO layer after the Command Complete for the LE Setup ISO Data Path command is received. This causes the parent to be incorrectly matched if multiple listen sockets are present. This commit adds a fix by matching the parent based on the BIG handle set in the notified connection. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 7a83e400ac77..0d98cc16bbac 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1733,6 +1733,13 @@ static bool iso_match_big(struct sock *sk, void *data) return ev->handle == iso_pi(sk)->qos.bcast.big; } +static bool iso_match_big_hcon(struct sock *sk, void *data) +{ + struct hci_conn *hcon = data; + + return hcon->iso_qos.bcast.big == iso_pi(sk)->qos.bcast.big; +} + static bool iso_match_pa_sync_flag(struct sock *sk, void *data) { return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); @@ -1756,8 +1763,16 @@ static void iso_conn_ready(struct iso_conn *conn) if (!hcon) return; - if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags) || - test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { + if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) { + /* A BIS slave hcon is notified to the ISO layer + * after the Command Complete for the LE Setup + * ISO Data Path command is received. Get the + * parent socket that matches the hcon BIG handle. + */ + parent = iso_get_sock(&hcon->src, &hcon->dst, + BT_LISTEN, iso_match_big_hcon, + hcon); + } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_SYNC_ESTABILISHED); -- cgit v1.3 From 42ecf1947135110ea08abeaca39741636f9a2285 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:38 +0200 Subject: Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending The Bluetooth Core spec does not allow a LE BIG Create sync command to be sent to Controller if another one is pending (Vol 4, Part E, page 2586). In order to avoid this issue, the HCI_CONN_CREATE_BIG_SYNC was added to mark that the LE BIG Create Sync command has been sent for a hcon. Once the BIG Sync Established event is received, the hcon flag is erased and the next pending hcon is handled. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 29 ++++++++++++++ net/bluetooth/hci_conn.c | 87 +++++++++++++++++++++++++++++++++------- net/bluetooth/hci_event.c | 20 ++++++++- net/bluetooth/iso.c | 4 +- 5 files changed, 125 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4becf201b063..5bb4eaa52e14 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -29,6 +29,7 @@ #define HCI_MAX_ACL_SIZE 1024 #define HCI_MAX_SCO_SIZE 255 #define HCI_MAX_ISO_SIZE 251 +#define HCI_MAX_ISO_BIS 31 #define HCI_MAX_EVENT_SIZE 260 #define HCI_MAX_FRAME_SIZE (HCI_MAX_ACL_SIZE + 4) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 43474b751a50..c95f7e6ba255 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -711,6 +711,9 @@ struct hci_conn { __s8 tx_power; __s8 max_tx_power; struct bt_iso_qos iso_qos; + __u8 num_bis; + __u8 bis[HCI_MAX_ISO_BIS]; + unsigned long flags; enum conn_reasons conn_reason; @@ -946,6 +949,7 @@ enum { HCI_CONN_PER_ADV, HCI_CONN_BIG_CREATED, HCI_CONN_CREATE_CIS, + HCI_CONN_CREATE_BIG_SYNC, HCI_CONN_BIG_SYNC, HCI_CONN_BIG_SYNC_FAILED, HCI_CONN_CREATE_PA_SYNC, @@ -1295,6 +1299,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn * +hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, + __u8 handle, __u8 num_bis) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type != ISO_LINK) + continue; + + if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) { @@ -1479,6 +1507,7 @@ void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); int hci_pa_create_sync_pending(struct hci_dev *hdev); +int hci_le_big_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index f9da12339db8..e996e9763666 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2180,34 +2180,93 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, return conn; } +static bool hci_conn_check_create_big_sync(struct hci_conn *conn) +{ + if (!conn->num_bis) + return false; + + return true; +} + +int hci_le_big_create_sync_pending(struct hci_dev *hdev) +{ + DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); + struct hci_conn *conn; + + rcu_read_lock(); + + pdu->num_bis = 0; + + /* The spec allows only one pending LE BIG Create Sync command at + * a time. If the command is pending now, don't do anything. We + * check for pending connections after each BIG Sync Established + * event. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2586: + * + * If the Host sends this command when the Controller is in the + * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ + * Established event has not been generated, the Controller shall + * return the error code Command Disallowed (0x0C). + */ + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (test_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags)) + goto unlock; + } + + list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { + if (hci_conn_check_create_big_sync(conn)) { + struct bt_iso_qos *qos = &conn->iso_qos; + + set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); + + pdu->handle = qos->bcast.big; + pdu->sync_handle = cpu_to_le16(conn->sync_handle); + pdu->encryption = qos->bcast.encryption; + memcpy(pdu->bcode, qos->bcast.bcode, + sizeof(pdu->bcode)); + pdu->mse = qos->bcast.mse; + pdu->timeout = cpu_to_le16(qos->bcast.timeout); + pdu->num_bis = conn->num_bis; + memcpy(pdu->bis, conn->bis, conn->num_bis); + + break; + } + } + +unlock: + rcu_read_unlock(); + + if (!pdu->num_bis) + return 0; + + return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, + struct_size(pdu, bis, pdu->num_bis), pdu); +} + int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) { - DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); int err; - if (num_bis < 0x01 || num_bis > pdu->num_bis) + if (num_bis < 0x01 || num_bis > ISO_MAX_NUM_BIS) return -EINVAL; err = qos_set_big(hdev, qos); if (err) return err; - if (hcon) - hcon->iso_qos.bcast.big = qos->bcast.big; + if (hcon) { + /* Update hcon QoS */ + hcon->iso_qos = *qos; - pdu->handle = qos->bcast.big; - pdu->sync_handle = cpu_to_le16(sync_handle); - pdu->encryption = qos->bcast.encryption; - memcpy(pdu->bcode, qos->bcast.bcode, sizeof(pdu->bcode)); - pdu->mse = qos->bcast.mse; - pdu->timeout = cpu_to_le16(qos->bcast.timeout); - pdu->num_bis = num_bis; - memcpy(pdu->bis, bis, num_bis); + hcon->num_bis = num_bis; + memcpy(hcon->bis, bis, num_bis); + } - return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, - struct_size(pdu, bis, num_bis), pdu); + return hci_le_big_create_sync_pending(hdev); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fd269fcabc2e..2b5ba8acd1d8 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6920,7 +6920,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_evt_le_big_sync_estabilished *ev = data; - struct hci_conn *bis; + struct hci_conn *bis, *conn; int i; bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); @@ -6931,6 +6931,20 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + conn = hci_conn_hash_lookup_big_sync_pend(hdev, ev->handle, + ev->num_bis); + if (!conn) { + bt_dev_err(hdev, + "Unable to find connection for big 0x%2.2x", + ev->handle); + goto unlock; + } + + clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); + + conn->num_bis = 0; + memset(conn->bis, 0, sizeof(conn->num_bis)); + for (i = 0; i < ev->num_bis; i++) { u16 handle = le16_to_cpu(ev->bis[i]); __le32 interval; @@ -6980,6 +6994,10 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, hci_connect_cfm(bis, ev->status); } +unlock: + /* Handle any other pending BIG sync command */ + hci_le_big_create_sync_pending(hdev); + hci_dev_unlock(hdev); } diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 0d98cc16bbac..9499ddfd25e7 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1957,6 +1957,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (sk) { int err; + struct hci_conn *hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; @@ -1965,7 +1966,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, NULL, + err = hci_le_big_create_sync(hdev, + hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, -- cgit v1.3 From 83d328a72eff3268ea4c19deb0a6cf4c7da15746 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Fri, 1 Nov 2024 10:23:39 +0200 Subject: Bluetooth: ISO: Update hci_conn_hash_lookup_big for Broadcast slave Currently, hci_conn_hash_lookup_big only checks for BIS master connections, by filtering out connections with the destination address set. This commit updates this function to also consider BIS slave connections, since it is also used for a Broadcast Receiver to set an available BIG handle before issuing the LE BIG Create Sync command. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 12 +++++++++++- net/bluetooth/hci_event.c | 1 + net/bluetooth/iso.c | 1 - 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c95f7e6ba255..ea798f07c5a2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1285,7 +1285,17 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK) + if (c->type != ISO_LINK) + continue; + + /* An ISO_LINK hcon with BDADDR_ANY as destination + * address is a Broadcast connection. A Broadcast + * slave connection is associated with a PA train, + * so the sync_handle can be used to differentiate + * from unicast. + */ + if (bacmp(&c->dst, BDADDR_ANY) && + c->sync_handle == HCI_SYNC_HANDLE_INVALID) continue; if (handle == c->iso_qos.bcast.big) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 2b5ba8acd1d8..aca121408369 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6965,6 +6965,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, /* Mark PA sync as established */ set_bit(HCI_CONN_PA_SYNC, &bis->flags); + bis->sync_handle = conn->sync_handle; bis->iso_qos.bcast.big = ev->handle; memset(&interval, 0, sizeof(interval)); memcpy(&interval, ev->latency, sizeof(ev->latency)); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 9499ddfd25e7..9e119da43147 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1839,7 +1839,6 @@ static void iso_conn_ready(struct iso_conn *conn) if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); hcon->dst_type = iso_pi(parent)->dst_type; - hcon->sync_handle = iso_pi(parent)->sync_handle; } if (ev3) { -- cgit v1.3 From e6720779ae612a14ac4ba7fe4fd5b27d900d932c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 1 Oct 2024 15:46:10 -0400 Subject: Bluetooth: SCO: Use kref to track lifetime of sco_conn This make use of kref to keep track of reference of sco_conn which allows better tracking of its lifetime with usage of things like kref_get_unless_zero in a similar way as used in l2cap_chan. In addition to it remove call to sco_sock_set_timer on __sco_sock_close since at that point it is useless to set a timer as the sk will be freed there is nothing to be done in sco_sock_timeout. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 99 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 1c7252a36866..1b8e468d24cf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -51,6 +51,7 @@ struct sco_conn { struct delayed_work timeout_work; unsigned int mtu; + struct kref ref; }; #define sco_conn_lock(c) spin_lock(&c->lock) @@ -76,6 +77,49 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) +static void sco_conn_free(struct kref *ref) +{ + struct sco_conn *conn = container_of(ref, struct sco_conn, ref); + + BT_DBG("conn %p", conn); + + if (conn->sk) + sco_pi(conn->sk)->conn = NULL; + + if (conn->hcon) { + conn->hcon->sco_data = NULL; + hci_conn_drop(conn->hcon); + } + + /* Ensure no more work items will run since hci_conn has been dropped */ + disable_delayed_work_sync(&conn->timeout_work); + + kfree(conn); +} + +static void sco_conn_put(struct sco_conn *conn) +{ + if (!conn) + return; + + BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref)); + + kref_put(&conn->ref, sco_conn_free); +} + +static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn) +{ + if (!conn) + return NULL; + + BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); + + if (!kref_get_unless_zero(&conn->ref)) + return NULL; + + return conn; +} + static struct sock *sco_sock_hold(struct sco_conn *conn) { if (!conn || !bt_sock_linked(&sco_sk_list, conn->sk)) @@ -92,6 +136,10 @@ static void sco_sock_timeout(struct work_struct *work) timeout_work.work); struct sock *sk; + conn = sco_conn_hold_unless_zero(conn); + if (!conn) + return; + sco_conn_lock(conn); if (!conn->hcon) { sco_conn_unlock(conn); @@ -99,6 +147,7 @@ static void sco_sock_timeout(struct work_struct *work) } sk = sco_sock_hold(conn); sco_conn_unlock(conn); + sco_conn_put(conn); if (!sk) return; @@ -136,9 +185,14 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) { struct sco_conn *conn = hcon->sco_data; + conn = sco_conn_hold_unless_zero(conn); if (conn) { - if (!conn->hcon) + if (!conn->hcon) { + sco_conn_lock(conn); conn->hcon = hcon; + sco_conn_unlock(conn); + } + sco_conn_put(conn); return conn; } @@ -146,6 +200,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) if (!conn) return NULL; + kref_init(&conn->ref); spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); @@ -170,17 +225,15 @@ static void sco_chan_del(struct sock *sk, int err) struct sco_conn *conn; conn = sco_pi(sk)->conn; + sco_pi(sk)->conn = NULL; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { sco_conn_lock(conn); conn->sk = NULL; - sco_pi(sk)->conn = NULL; sco_conn_unlock(conn); - - if (conn->hcon) - hci_conn_drop(conn->hcon); + sco_conn_put(conn); } sk->sk_state = BT_CLOSED; @@ -195,29 +248,28 @@ static void sco_conn_del(struct hci_conn *hcon, int err) struct sco_conn *conn = hcon->sco_data; struct sock *sk; + conn = sco_conn_hold_unless_zero(conn); if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); - /* Kill socket */ sco_conn_lock(conn); sk = sco_sock_hold(conn); sco_conn_unlock(conn); + sco_conn_put(conn); - if (sk) { - lock_sock(sk); - sco_sock_clear_timer(sk); - sco_chan_del(sk, err); - release_sock(sk); - sock_put(sk); + if (!sk) { + sco_conn_put(conn); + return; } - /* Ensure no more work items will run before freeing conn. */ - cancel_delayed_work_sync(&conn->timeout_work); - - hcon->sco_data = NULL; - kfree(conn); + /* Kill socket */ + lock_sock(sk); + sco_sock_clear_timer(sk); + sco_chan_del(sk, err); + release_sock(sk); + sock_put(sk); } static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, @@ -401,6 +453,8 @@ static void sco_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); + sco_conn_put(sco_pi(sk)->conn); + skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } @@ -448,17 +502,6 @@ static void __sco_sock_close(struct sock *sk) case BT_CONNECTED: case BT_CONFIG: - if (sco_pi(sk)->conn->hcon) { - sk->sk_state = BT_DISCONN; - sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT); - sco_conn_lock(sco_pi(sk)->conn); - hci_conn_drop(sco_pi(sk)->conn->hcon); - sco_pi(sk)->conn->hcon = NULL; - sco_conn_unlock(sco_pi(sk)->conn); - } else - sco_chan_del(sk, ECONNRESET); - break; - case BT_CONNECT2: case BT_CONNECT: case BT_DISCONN: -- cgit v1.3 From dc26097bdb864a0d5955b9a25e43376ffc1af99b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 1 Oct 2024 16:15:51 -0400 Subject: Bluetooth: ISO: Use kref to track lifetime of iso_conn This make use of kref to keep track of reference of iso_conn which allows better tracking of its lifetime with usage of things like kref_get_unless_zero in a similar way as used in l2cap_chan. In addition to it remove call to iso_sock_set_timer on iso_sock_disconn since at that point it is useless to set a timer as the sk will be freed there is nothing to be done in iso_sock_timeout. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 88 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 9e119da43147..24e78ada9ad2 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -35,6 +35,7 @@ struct iso_conn { struct sk_buff *rx_skb; __u32 rx_len; __u16 tx_sn; + struct kref ref; }; #define iso_conn_lock(c) spin_lock(&(c)->lock) @@ -93,6 +94,49 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, #define ISO_CONN_TIMEOUT (HZ * 40) #define ISO_DISCONN_TIMEOUT (HZ * 2) +static void iso_conn_free(struct kref *ref) +{ + struct iso_conn *conn = container_of(ref, struct iso_conn, ref); + + BT_DBG("conn %p", conn); + + if (conn->sk) + iso_pi(conn->sk)->conn = NULL; + + if (conn->hcon) { + conn->hcon->iso_data = NULL; + hci_conn_drop(conn->hcon); + } + + /* Ensure no more work items will run since hci_conn has been dropped */ + disable_delayed_work_sync(&conn->timeout_work); + + kfree(conn); +} + +static void iso_conn_put(struct iso_conn *conn) +{ + if (!conn) + return; + + BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref)); + + kref_put(&conn->ref, iso_conn_free); +} + +static struct iso_conn *iso_conn_hold_unless_zero(struct iso_conn *conn) +{ + if (!conn) + return NULL; + + BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); + + if (!kref_get_unless_zero(&conn->ref)) + return NULL; + + return conn; +} + static struct sock *iso_sock_hold(struct iso_conn *conn) { if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk)) @@ -109,9 +153,14 @@ static void iso_sock_timeout(struct work_struct *work) timeout_work.work); struct sock *sk; + conn = iso_conn_hold_unless_zero(conn); + if (!conn) + return; + iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); + iso_conn_put(conn); if (!sk) return; @@ -149,9 +198,14 @@ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) { struct iso_conn *conn = hcon->iso_data; + conn = iso_conn_hold_unless_zero(conn); if (conn) { - if (!conn->hcon) + if (!conn->hcon) { + iso_conn_lock(conn); conn->hcon = hcon; + iso_conn_unlock(conn); + } + iso_conn_put(conn); return conn; } @@ -159,6 +213,7 @@ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) if (!conn) return NULL; + kref_init(&conn->ref); spin_lock_init(&conn->lock); INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout); @@ -178,17 +233,15 @@ static void iso_chan_del(struct sock *sk, int err) struct sock *parent; conn = iso_pi(sk)->conn; + iso_pi(sk)->conn = NULL; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { iso_conn_lock(conn); conn->sk = NULL; - iso_pi(sk)->conn = NULL; iso_conn_unlock(conn); - - if (conn->hcon) - hci_conn_drop(conn->hcon); + iso_conn_put(conn); } sk->sk_state = BT_CLOSED; @@ -210,6 +263,7 @@ static void iso_conn_del(struct hci_conn *hcon, int err) struct iso_conn *conn = hcon->iso_data; struct sock *sk; + conn = iso_conn_hold_unless_zero(conn); if (!conn) return; @@ -219,20 +273,18 @@ static void iso_conn_del(struct hci_conn *hcon, int err) iso_conn_lock(conn); sk = iso_sock_hold(conn); iso_conn_unlock(conn); + iso_conn_put(conn); - if (sk) { - lock_sock(sk); - iso_sock_clear_timer(sk); - iso_chan_del(sk, err); - release_sock(sk); - sock_put(sk); + if (!sk) { + iso_conn_put(conn); + return; } - /* Ensure no more work items will run before freeing conn. */ - cancel_delayed_work_sync(&conn->timeout_work); - - hcon->iso_data = NULL; - kfree(conn); + lock_sock(sk); + iso_sock_clear_timer(sk); + iso_chan_del(sk, err); + release_sock(sk); + sock_put(sk); } static int __iso_chan_add(struct iso_conn *conn, struct sock *sk, @@ -652,6 +704,8 @@ static void iso_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); + iso_conn_put(iso_pi(sk)->conn); + skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } @@ -711,6 +765,7 @@ static void iso_sock_disconn(struct sock *sk) */ if (bis_sk) { hcon->state = BT_OPEN; + hcon->iso_data = NULL; iso_pi(sk)->conn->hcon = NULL; iso_sock_clear_timer(sk); iso_chan_del(sk, bt_to_errno(hcon->abort_reason)); @@ -720,7 +775,6 @@ static void iso_sock_disconn(struct sock *sk) } sk->sk_state = BT_DISCONN; - iso_sock_set_timer(sk, ISO_DISCONN_TIMEOUT); iso_conn_lock(iso_pi(sk)->conn); hci_conn_drop(iso_pi(sk)->conn->hcon); iso_pi(sk)->conn->hcon = NULL; -- cgit v1.3 From 25ab2db3e60e0e84d7cdc740ea6ae3c10fe61eaa Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Mon, 11 Nov 2024 13:47:07 +0200 Subject: Bluetooth: hci_conn: Remove alloc from critical section This removes the kzalloc memory allocation inside critical section in create_pa_sync, fixing the following message that appears when the kernel is compiled with CONFIG_DEBUG_ATOMIC_SLEEP enabled: BUG: sleeping function called from invalid context at include/linux/sched/mm.h:321 Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e996e9763666..b5b78d469d54 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2079,7 +2079,7 @@ static bool hci_conn_check_create_pa_sync(struct hci_conn *conn) static int create_pa_sync(struct hci_dev *hdev, void *data) { - struct hci_cp_le_pa_create_sync *cp = NULL; + struct hci_cp_le_pa_create_sync cp = {0}; struct hci_conn *conn; int err = 0; @@ -2108,19 +2108,13 @@ static int create_pa_sync(struct hci_dev *hdev, void *data) if (hci_conn_check_create_pa_sync(conn)) { struct bt_iso_qos *qos = &conn->iso_qos; - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - err = -ENOMEM; - goto unlock; - } - - cp->options = qos->bcast.options; - cp->sid = conn->sid; - cp->addr_type = conn->dst_type; - bacpy(&cp->addr, &conn->dst); - cp->skip = cpu_to_le16(qos->bcast.skip); - cp->sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); - cp->sync_cte_type = qos->bcast.sync_cte_type; + cp.options = qos->bcast.options; + cp.sid = conn->sid; + cp.addr_type = conn->dst_type; + bacpy(&cp.addr, &conn->dst); + cp.skip = cpu_to_le16(qos->bcast.skip); + cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp.sync_cte_type = qos->bcast.sync_cte_type; break; } @@ -2131,17 +2125,15 @@ unlock: hci_dev_unlock(hdev); - if (cp) { + if (bacmp(&cp.addr, BDADDR_ANY)) { hci_dev_set_flag(hdev, HCI_PA_SYNC); set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, - sizeof(*cp), cp, HCI_CMD_TIMEOUT); + sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (!err) err = hci_update_passive_scan_sync(hdev); - kfree(cp); - if (err) { hci_dev_clear_flag(hdev, HCI_PA_SYNC); clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); -- cgit v1.3 From 07a9342b94a91b306ed1cf6aa8254aea210764c9 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Mon, 11 Nov 2024 13:47:08 +0200 Subject: Bluetooth: ISO: Send BIG Create Sync via hci_sync Before issuing the LE BIG Create Sync command, an available BIG handle is chosen by iterating through the conn_hash list and finding the first unused value. If a BIG is terminated, the associated hcons are removed from the list and the LE BIG Terminate Sync command is sent via hci_sync queue. However, a new LE BIG Create sync command might be issued via hci_send_cmd, before the previous BIG sync was terminated. This can cause the same BIG handle to be reused and the LE BIG Create Sync to fail with Command Disallowed. < HCI Command: LE Broadcast Isochronous Group Create Sync (0x08|0x006b) BIG Handle: 0x00 BIG Sync Handle: 0x0002 Encryption: Unencrypted (0x00) Broadcast Code[16]: 00000000000000000000000000000000 Maximum Number Subevents: 0x00 Timeout: 20000 ms (0x07d0) Number of BIS: 1 BIS ID: 0x01 > HCI Event: Command Status (0x0f) plen 4 LE Broadcast Isochronous Group Create Sync (0x08|0x006b) ncmd 1 Status: Command Disallowed (0x0c) < HCI Command: LE Broadcast Isochronous Group Terminate Sync (0x08|0x006c) BIG Handle: 0x00 This commit fixes the ordering of the LE BIG Create Sync/LE BIG Terminate Sync commands, to make sure that either the previous BIG sync is terminated before reusing the handle, or that a new handle is chosen for a new sync. Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 17 ++++++++++++++++- net/bluetooth/iso.c | 9 +++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b5b78d469d54..d097e308a755 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2180,7 +2180,15 @@ static bool hci_conn_check_create_big_sync(struct hci_conn *conn) return true; } -int hci_le_big_create_sync_pending(struct hci_dev *hdev) +static void big_create_sync_complete(struct hci_dev *hdev, void *data, int err) +{ + bt_dev_dbg(hdev, ""); + + if (err) + bt_dev_err(hdev, "Unable to create BIG sync: %d", err); +} + +static int big_create_sync(struct hci_dev *hdev, void *data) { DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); struct hci_conn *conn; @@ -2237,6 +2245,13 @@ unlock: struct_size(pdu, bis, pdu->num_bis), pdu); } +int hci_le_big_create_sync_pending(struct hci_dev *hdev) +{ + /* Queue big_create_sync */ + return hci_cmd_sync_queue_once(hdev, big_create_sync, + NULL, big_create_sync_complete); +} + int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 24e78ada9ad2..1b40fd2b2f02 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1392,6 +1392,13 @@ static void iso_conn_big_sync(struct sock *sk) if (!hdev) return; + /* hci_le_big_create_sync requires hdev lock to be held, since + * it enqueues the HCI LE BIG Create Sync command via + * hci_cmd_sync_queue_once, which checks hdev flags that might + * change. + */ + hci_dev_lock(hdev); + if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, &iso_pi(sk)->qos, @@ -1402,6 +1409,8 @@ static void iso_conn_big_sync(struct sock *sk) bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); } + + hci_dev_unlock(hdev); } static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, -- cgit v1.3 From 55abbd148dfb604ebf3f72d6c3dd2a8063d40718 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 8 Nov 2024 11:19:54 -0500 Subject: Bluetooth: hci_core: Fix calling mgmt_device_connected Since 61a939c68ee0 ("Bluetooth: Queue incoming ACL data until BT_CONNECTED state is reached") there is no long the need to call mgmt_device_connected as ACL data will be queued until BT_CONNECTED state. Link: https://bugzilla.kernel.org/show_bug.cgi?id=219458 Link: https://github.com/bluez/bluez/issues/1014 Fixes: 333b4fd11e89 ("Bluetooth: L2CAP: Fix uaf in l2cap_connect") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f6cff34a8542..f9e19f9cb5a3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3792,8 +3792,6 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); - if (conn && hci_dev_test_flag(hdev, HCI_MGMT)) - mgmt_device_connected(hdev, conn, NULL, 0); hci_dev_unlock(hdev); if (conn) { -- cgit v1.3 From 27aabf27fd014ae037cc179c61b0bee7cff55b3d Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 1 Nov 2024 14:44:10 +0300 Subject: Bluetooth: fix use-after-free in device_for_each_child() Syzbot has reported the following KASAN splat: BUG: KASAN: slab-use-after-free in device_for_each_child+0x18f/0x1a0 Read of size 8 at addr ffff88801f605308 by task kbnepd bnep0/4980 CPU: 0 UID: 0 PID: 4980 Comm: kbnepd bnep0 Not tainted 6.12.0-rc4-00161-gae90f6a6170d #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 Call Trace: dump_stack_lvl+0x100/0x190 ? device_for_each_child+0x18f/0x1a0 print_report+0x13a/0x4cb ? __virt_addr_valid+0x5e/0x590 ? __phys_addr+0xc6/0x150 ? device_for_each_child+0x18f/0x1a0 kasan_report+0xda/0x110 ? device_for_each_child+0x18f/0x1a0 ? __pfx_dev_memalloc_noio+0x10/0x10 device_for_each_child+0x18f/0x1a0 ? __pfx_device_for_each_child+0x10/0x10 pm_runtime_set_memalloc_noio+0xf2/0x180 netdev_unregister_kobject+0x1ed/0x270 unregister_netdevice_many_notify+0x123c/0x1d80 ? __mutex_trylock_common+0xde/0x250 ? __pfx_unregister_netdevice_many_notify+0x10/0x10 ? trace_contention_end+0xe6/0x140 ? __mutex_lock+0x4e7/0x8f0 ? __pfx_lock_acquire.part.0+0x10/0x10 ? rcu_is_watching+0x12/0xc0 ? unregister_netdev+0x12/0x30 unregister_netdevice_queue+0x30d/0x3f0 ? __pfx_unregister_netdevice_queue+0x10/0x10 ? __pfx_down_write+0x10/0x10 unregister_netdev+0x1c/0x30 bnep_session+0x1fb3/0x2ab0 ? __pfx_bnep_session+0x10/0x10 ? __pfx_lock_release+0x10/0x10 ? __pfx_woken_wake_function+0x10/0x10 ? __kthread_parkme+0x132/0x200 ? __pfx_bnep_session+0x10/0x10 ? kthread+0x13a/0x370 ? __pfx_bnep_session+0x10/0x10 kthread+0x2b7/0x370 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x48/0x80 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Allocated by task 4974: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 __kasan_kmalloc+0xaa/0xb0 __kmalloc_noprof+0x1d1/0x440 hci_alloc_dev_priv+0x1d/0x2820 __vhci_create_device+0xef/0x7d0 vhci_write+0x2c7/0x480 vfs_write+0x6a0/0xfc0 ksys_write+0x12f/0x260 do_syscall_64+0xc7/0x250 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 4979: kasan_save_stack+0x30/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x4f/0x70 kfree+0x141/0x490 hci_release_dev+0x4d9/0x600 bt_host_release+0x6a/0xb0 device_release+0xa4/0x240 kobject_put+0x1ec/0x5a0 put_device+0x1f/0x30 vhci_release+0x81/0xf0 __fput+0x3f6/0xb30 task_work_run+0x151/0x250 do_exit+0xa79/0x2c30 do_group_exit+0xd5/0x2a0 get_signal+0x1fcd/0x2210 arch_do_signal_or_restart+0x93/0x780 syscall_exit_to_user_mode+0x140/0x290 do_syscall_64+0xd4/0x250 entry_SYSCALL_64_after_hwframe+0x77/0x7f In 'hci_conn_del_sysfs()', 'device_unregister()' may be called when an underlying (kobject) reference counter is greater than 1. This means that reparenting (happened when the device is actually freed) is delayed and, during that delay, parent controller device (hciX) may be deleted. Since the latter may create a dangling pointer to freed parent, avoid that scenario by reparenting to NULL explicitly. Reported-by: syzbot+6cf5652d3df49fae2e3f@syzkaller.appspotmail.com Tested-by: syzbot+6cf5652d3df49fae2e3f@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6cf5652d3df49fae2e3f Fixes: a85fb91e3d72 ("Bluetooth: Fix double free in hci_conn_cleanup") Signed-off-by: Dmitry Antipov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sysfs.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 367e32fe30eb..4b54dbbf0729 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -21,16 +21,6 @@ static const struct device_type bt_link = { .release = bt_link_release, }; -/* - * The rfcomm tty device will possibly retain even when conn - * is down, and sysfs doesn't support move zombie device, - * so we should move the device before conn device is destroyed. - */ -static int __match_tty(struct device *dev, void *data) -{ - return !strncmp(dev_name(dev), "rfcomm", 6); -} - void hci_conn_init_sysfs(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -73,10 +63,13 @@ void hci_conn_del_sysfs(struct hci_conn *conn) return; } + /* If there are devices using the connection as parent reset it to NULL + * before unregistering the device. + */ while (1) { struct device *dev; - dev = device_find_child(&conn->dev, NULL, __match_tty); + dev = device_find_any_child(&conn->dev); if (!dev) break; device_move(dev, NULL, DPM_ORDER_DEV_LAST); -- cgit v1.3 From 827af4787e74e8df9e8e0677a69fbb15e0856d2f Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 23 Oct 2024 16:55:57 -0400 Subject: Bluetooth: MGMT: Add initial implementation of MGMT_OP_HCI_CMD_SYNC This adds the initial implementation of MGMT_OP_HCI_CMD_SYNC as documented in mgmt-api (BlueZ tree): Send HCI command and wait for event Command =========================================== Command Code: 0x005B Controller Index: Command Parameters: Opcode (2 Octets) Event (1 Octet) Timeout (1 Octet) Parameter Length (2 Octets) Parameter (variable) Return Parameters: Response (1-variable Octets) This command may be used to send a HCI command and wait for an (optional) event. The HCI command is specified by the Opcode, any arbitrary is supported including vendor commands, but contrary to the like of Raw/User channel it is run as an HCI command send by the kernel since it uses its command synchronization thus it is possible to wait for a specific event as a response. Setting event to 0x00 will cause the command to wait for either HCI Command Status or HCI Command Complete. Timeout is specified in seconds, setting it to 0 will cause the default timeout to be used. Possible errors: Failed Invalid Parameters Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 10 ++++++++ net/bluetooth/mgmt.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index d382679efd2b..affac861efdc 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -878,6 +878,16 @@ struct mgmt_cp_mesh_send_cancel { } __packed; #define MGMT_MESH_SEND_CANCEL_SIZE 1 +#define MGMT_OP_HCI_CMD_SYNC 0x005B +struct mgmt_cp_hci_cmd_sync { + __le16 opcode; + __u8 event; + __u8 timeout; + __le16 params_len; + __u8 params[]; +} __packed; +#define MGMT_HCI_CMD_SYNC_SIZE 6 + #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { __le16 opcode; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a429661b676a..1f6d083682b8 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -132,6 +132,7 @@ static const u16 mgmt_commands[] = { MGMT_OP_MESH_READ_FEATURES, MGMT_OP_MESH_SEND, MGMT_OP_MESH_SEND_CANCEL, + MGMT_OP_HCI_CMD_SYNC, }; static const u16 mgmt_events[] = { @@ -2515,6 +2516,64 @@ unlock: return err; } +static int send_hci_cmd_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_hci_cmd_sync *cp = cmd->param; + struct sk_buff *skb; + + skb = __hci_cmd_sync_ev(hdev, le16_to_cpu(cp->opcode), + le16_to_cpu(cp->params_len), cp->params, + cp->event, cp->timeout ? + msecs_to_jiffies(cp->timeout * 1000) : + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, + mgmt_status(PTR_ERR(skb))); + goto done; + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, 0, + skb->data, skb->len); + + kfree_skb(skb); + +done: + mgmt_pending_free(cmd); + + return 0; +} + +static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_cp_hci_cmd_sync *cp = data; + struct mgmt_pending_cmd *cmd; + int err; + + if (len < sizeof(*cp)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + cmd = mgmt_pending_new(sk, MGMT_OP_HCI_CMD_SYNC, hdev, data, len); + if (!cmd) + err = -ENOMEM; + else + err = hci_cmd_sync_queue(hdev, send_hci_cmd_sync, cmd, NULL); + + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, + MGMT_STATUS_FAILED); + + if (cmd) + mgmt_pending_free(cmd); + } + + hci_dev_unlock(hdev); + return err; +} + /* This is a helper function to test for pending mgmt commands that can * cause CoD or EIR HCI commands. We can only allow one such pending * mgmt command at a time since otherwise we cannot easily track what @@ -9371,6 +9430,7 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { mesh_send, MGMT_MESH_SEND_SIZE, HCI_MGMT_VAR_LEN }, { mesh_send_cancel, MGMT_MESH_SEND_CANCEL_SIZE }, + { mgmt_hci_cmd_sync, MGMT_HCI_CMD_SYNC_SIZE, HCI_MGMT_VAR_LEN }, }; void mgmt_index_added(struct hci_dev *hdev) -- cgit v1.3 From bfe086be5c4c644602e26840683dfdd893f22d04 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 8 Nov 2024 17:47:12 +0100 Subject: bpf: ipv4: Prepare __bpf_redirect_neigh_v4() to future .flowi4_tos conversion. Use ip4h_dscp() to get the DSCP from the IPv4 header, then convert the dscp_t value to __u8 with inet_dscp_to_dsfield(). Then, when we'll convert .flowi4_tos to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/35eacc8955003e434afb1365d404193cc98a9579.1731064982.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index bc33692acabc..4c396305cd4d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2355,7 +2355,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = ip4h->tos & INET_DSCP_MASK, + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, -- cgit v1.3 From dab9c6307161adc626dd21b8e9596a289e714155 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 8 Nov 2024 17:47:15 +0100 Subject: bpf: lwtunnel: Prepare bpf_lwt_xmit_reroute() to future .flowi4_tos conversion. Use ip4h_dscp() to get the DSCP from the IPv4 header, then convert the dscp_t value to __u8 with inet_dscp_to_dsfield(). Then, when we'll convert .flowi4_tos to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/8338a12377c44f698a651d1ce357dd92bdf18120.1731064982.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/core/lwt_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 8a78bff53b2c..ae74634310a3 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; -- cgit v1.3 From 9e43ad7a1edef268acac603e1975c8f50a20d02f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 13 Nov 2024 12:13:09 +0000 Subject: net: ethtool: only allow set_rxnfc with rss + ring_cookie if driver opts in Ethtool ntuple filters with FLOW_RSS were originally defined as adding the base queue ID (ring_cookie) to the value from the indirection table, so that the same table could distribute over more than one set of queues when used by different filters. However, some drivers / hardware ignore the ring_cookie, and simply use the indirection table entries as queue IDs directly. Thus, for drivers which have not opted in by setting ethtool_ops.cap_rss_rxnfc_adds to declare that they support the original (addition) semantics, reject in ethtool_set_rxnfc any filter which combines FLOW_RSS and a nonzero ring. (For a ring_cookie of zero, both behaviours are equivalent.) Set the cap bit in sfc, as it is known to support this feature. Signed-off-by: Edward Cree Reviewed-by: Martin Habets Link: https://patch.msgid.link/cc3da0844083b0e301a33092a6299e4042b65221.1731499022.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sfc/ef100_ethtool.c | 1 + drivers/net/ethernet/sfc/ethtool.c | 1 + include/linux/ethtool.h | 4 ++++ net/ethtool/ioctl.c | 5 +++++ 4 files changed, 11 insertions(+) (limited to 'net') diff --git a/drivers/net/ethernet/sfc/ef100_ethtool.c b/drivers/net/ethernet/sfc/ef100_ethtool.c index 5c2551369812..6c3b74000d3b 100644 --- a/drivers/net/ethernet/sfc/ef100_ethtool.c +++ b/drivers/net/ethernet/sfc/ef100_ethtool.c @@ -59,6 +59,7 @@ const struct ethtool_ops ef100_ethtool_ops = { .get_rxfh_indir_size = efx_ethtool_get_rxfh_indir_size, .get_rxfh_key_size = efx_ethtool_get_rxfh_key_size, .rxfh_per_ctx_key = true, + .cap_rss_rxnfc_adds = true, .rxfh_priv_size = sizeof(struct efx_rss_context_priv), .get_rxfh = efx_ethtool_get_rxfh, .set_rxfh = efx_ethtool_set_rxfh, diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index bb1930818beb..83d715544f7f 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -263,6 +263,7 @@ const struct ethtool_ops efx_ethtool_ops = { .get_rxfh_indir_size = efx_ethtool_get_rxfh_indir_size, .get_rxfh_key_size = efx_ethtool_get_rxfh_key_size, .rxfh_per_ctx_key = true, + .cap_rss_rxnfc_adds = true, .rxfh_priv_size = sizeof(struct efx_rss_context_priv), .get_rxfh = efx_ethtool_get_rxfh, .set_rxfh = efx_ethtool_set_rxfh, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1199e308c8dd..299280c94d07 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -734,6 +734,9 @@ struct kernel_ethtool_ts_info { * @rxfh_per_ctx_key: device supports setting different RSS key for each * additional context. Netlink API should report hfunc, key, and input_xfrm * for every context, not just context 0. + * @cap_rss_rxnfc_adds: device supports nonzero ring_cookie in filters with + * %FLOW_RSS flag; the queue ID from the filter is added to the value from + * the indirection table to determine the delivery queue. * @rxfh_indir_space: max size of RSS indirection tables, if indirection table * size as returned by @get_rxfh_indir_size may change during lifetime * of the device. Leave as 0 if the table size is constant. @@ -956,6 +959,7 @@ struct ethtool_ops { u32 cap_rss_ctx_supported:1; u32 cap_rss_sym_xor_supported:1; u32 rxfh_per_ctx_key:1; + u32 cap_rss_rxnfc_adds:1; u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7da94e26ced6..d86399bcf223 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -992,6 +992,11 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (rc) return rc; + /* Nonzero ring with RSS only makes sense if NIC adds them together */ + if (info.flow_type & FLOW_RSS && !ops->cap_rss_rxnfc_adds && + ethtool_get_flow_spec_ring(info.fs.ring_cookie)) + return -EINVAL; + if (ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; -- cgit v1.3 From a64499f618b2e3ace083727237f8802aabc73008 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 13 Nov 2024 12:13:10 +0000 Subject: net: ethtool: account for RSS+RXNFC add semantics when checking channel count In ethtool_check_max_channel(), the new RX count must not only cover the max queue indices in RSS indirection tables and RXNFC destinations separately, but must also, for RXNFC rules with FLOW_RSS, cover the sum of the destination queue and the maximum index in the associated RSS context's indirection table, since that is the highest queue that the rule can actually deliver traffic to. It could be argued that the max queue across all custom RSS contexts (ethtool_get_max_rss_ctx_channel()) need no longer be considered, since any context to which packets can actually be delivered will be targeted by some RXNFC rule and its max will thus be allowed for by ethtool_get_max_rxnfc_channel(). For simplicity we keep both checks, so even RSS contexts unused by any RXNFC rule must fit the channel count. Signed-off-by: Edward Cree Link: https://patch.msgid.link/43257d375434bef388e36181492aa4c458b88336.1731499022.git.ecree.xilinx@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/common.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 0d62363dbd9d..05ce4f8080b3 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -538,6 +538,20 @@ static int ethtool_get_rxnfc_rule_count(struct net_device *dev) return info.rule_cnt; } +/* Max offset for one RSS context */ +static u32 ethtool_get_rss_ctx_max_channel(struct ethtool_rxfh_context *ctx) +{ + u32 max_ring = 0; + u32 i, *tbl; + + if (WARN_ON_ONCE(!ctx)) + return 0; + tbl = ethtool_rxfh_context_indir(ctx); + for (i = 0; i < ctx->indir_size; i++) + max_ring = max(max_ring, tbl[i]); + return max_ring; +} + static int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -574,10 +588,18 @@ static int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max) if (rule_info.fs.ring_cookie != RX_CLS_FLOW_DISC && rule_info.fs.ring_cookie != RX_CLS_FLOW_WAKE && - !(rule_info.flow_type & FLOW_RSS) && - !ethtool_get_flow_spec_ring_vf(rule_info.fs.ring_cookie)) - max_ring = - max_t(u64, max_ring, rule_info.fs.ring_cookie); + !ethtool_get_flow_spec_ring_vf(rule_info.fs.ring_cookie)) { + u64 ring = rule_info.fs.ring_cookie; + + if (rule_info.flow_type & FLOW_RSS) { + struct ethtool_rxfh_context *ctx; + + ctx = xa_load(&dev->ethtool->rss_ctx, + rule_info.rss_context); + ring += ethtool_get_rss_ctx_max_channel(ctx); + } + max_ring = max_t(u64, max_ring, ring); + } } kvfree(info); @@ -589,6 +611,7 @@ err_free_info: return err; } +/* Max offset across all of a device's RSS contexts */ static u32 ethtool_get_max_rss_ctx_channel(struct net_device *dev) { struct ethtool_rxfh_context *ctx; @@ -596,13 +619,8 @@ static u32 ethtool_get_max_rss_ctx_channel(struct net_device *dev) u32 max_ring = 0; mutex_lock(&dev->ethtool->rss_lock); - xa_for_each(&dev->ethtool->rss_ctx, context, ctx) { - u32 i, *tbl; - - tbl = ethtool_rxfh_context_indir(ctx); - for (i = 0; i < ctx->indir_size; i++) - max_ring = max(max_ring, tbl[i]); - } + xa_for_each(&dev->ethtool->rss_ctx, context, ctx) + max_ring = max(max_ring, ethtool_get_rss_ctx_max_channel(ctx)); mutex_unlock(&dev->ethtool->rss_lock); return max_ring; @@ -611,7 +629,7 @@ static u32 ethtool_get_max_rss_ctx_channel(struct net_device *dev) static u32 ethtool_get_max_rxfh_channel(struct net_device *dev) { struct ethtool_rxfh_param rxfh = {}; - u32 dev_size, current_max; + u32 dev_size, current_max = 0; int ret; /* While we do track whether RSS context has an indirection -- cgit v1.3 From a35672819f8d85e2ae38b80d40b923e3ef81e4ea Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 14 Nov 2024 12:06:56 +0100 Subject: xfrm: Fix acquire state insertion. A recent commit jumped over the dst hash computation and left the symbol uninitialized. Fix this by explicitly computing the dst hash before it is used. Fixes: 0045e3d80613 ("xfrm: Cache used outbound xfrm states at the policy.") Reported-by: Dan Carpenter Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index e3266a5d4f90..67ca7ac955a3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1470,6 +1470,7 @@ found: x->km.state = XFRM_STATE_ACQ; x->dir = XFRM_SA_DIR_OUT; list_add(&x->km.all, &net->xfrm.state_all); + h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); -- cgit v1.3 From 0608746f95b29402421c0e0e96005afba45178ec Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 14 Nov 2024 17:03:21 +0100 Subject: netfilter: ipv4: Convert ip_route_me_harder() to dscp_t. Use ip4h_dscp()instead of reading iph->tos directly. ip4h_dscp() returns a dscp_t value which is temporarily converted back to __u8 with inet_dscp_to_dsfield(). When converting ->flowi4_tos to dscp_t in the future, we'll only have to remove that inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index e0aab66cd925..08bc3f2c0078 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; -- cgit v1.3 From 6f9615a6e686bc0acfb5a02050a50782a6a378b2 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 14 Nov 2024 17:03:31 +0100 Subject: netfilter: flow_offload: Convert nft_flow_route() to dscp_t. Use ip4h_dscp()instead of reading ip_hdr()->tos directly. ip4h_dscp() returns a dscp_t value which is temporarily converted back to __u8 with inet_dscp_to_dsfield(). When converting ->flowi4_tos to dscp_t in the future, we'll only have to remove that inet_dscp_to_dsfield() call. Also, remove the comment about the net/ip.h include file, since it's now required for the ip4h_dscp() helper too. Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 65199c23c75c..3b474d235663 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -8,7 +8,7 @@ #include #include #include -#include /* for ipv4 options. */ +#include #include #include #include @@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = ip_hdr(pkt->skb)->tos & INET_DSCP_MASK; + fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; -- cgit v1.3 From f694ce6de58930146037aa3f69a534e98b007ff3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 14 Nov 2024 17:03:38 +0100 Subject: netfilter: rpfilter: Convert rpfilter_mt() to dscp_t. Use ip4h_dscp() instead of reading iph->tos directly. ip4h_dscp() returns a dscp_t value which is temporarily converted back to __u8 with inet_dscp_to_dsfield(). When converting ->flowi4_tos to dscp_t in the future, we'll only have to remove that inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_rpfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 1ce7a1655b97..a27782d7653e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = iph->tos & INET_DSCP_MASK; + flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); -- cgit v1.3 From f12b67cc7d1b67fe9ecee537df5b55625889ca9f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 14 Nov 2024 17:03:45 +0100 Subject: netfilter: nft_fib: Convert nft_fib4_eval() to dscp_t. Use ip4h_dscp() instead of reading iph->tos directly. ip4h_dscp() returns a dscp_t value which is temporarily converted back to __u8 with inet_dscp_to_dsfield(). When converting ->flowi4_tos to dscp_t in the future, we'll only have to remove that inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_fib_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 09fff5d424ef..625adbc42037 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -107,7 +108,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; -- cgit v1.3 From f0d839c13ed50175a6e8b3a5ccd591ed15307995 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 14 Nov 2024 17:03:52 +0100 Subject: netfilter: nf_dup4: Convert nf_dup_ipv4_route() to dscp_t. Use ip4h_dscp() instead of reading iph->tos directly. ip4h_dscp() returns a dscp_t value which is temporarily converted back to __u8 with inet_dscp_to_dsfield(). When converting ->flowi4_tos to dscp_t in the future, we'll only have to remove that inet_dscp_to_dsfield() call. Signed-off-by: Guillaume Nault Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ec94ee1051c7..25e1e8eb18dd 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; + fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); -- cgit v1.3 From a12143e6084c502fc3cfaa8b717bffc8c14cf806 Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 14 Nov 2024 22:07:51 +0100 Subject: netfilter: bitwise: rename some boolean operation functions In the next patch we add support for doing AND, OR and XOR operations directly in the kernel, so rename some functions and an enum constant related to mask-and-xor boolean operations. Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 10 +++++++--- net/netfilter/nft_bitwise.c | 34 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9e9079321380..487542234ccd 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -564,16 +564,20 @@ enum nft_immediate_attributes { /** * enum nft_bitwise_ops - nf_tables bitwise operations * - * @NFT_BITWISE_BOOL: mask-and-xor operation used to implement NOT, AND, OR and - * XOR boolean operations + * @NFT_BITWISE_MASK_XOR: mask-and-xor operation used to implement NOT, AND, OR + * and XOR boolean operations * @NFT_BITWISE_LSHIFT: left-shift operation * @NFT_BITWISE_RSHIFT: right-shift operation */ enum nft_bitwise_ops { - NFT_BITWISE_BOOL, + NFT_BITWISE_MASK_XOR, NFT_BITWISE_LSHIFT, NFT_BITWISE_RSHIFT, }; +/* + * Old name for NFT_BITWISE_MASK_XOR. Retained for backwards-compatibility. + */ +#define NFT_BITWISE_BOOL NFT_BITWISE_MASK_XOR /** * enum nft_bitwise_attributes - nf_tables bitwise expression netlink attributes diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 7de95674fd8c..7f6a4f800537 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -25,8 +25,8 @@ struct nft_bitwise { struct nft_data data; }; -static void nft_bitwise_eval_bool(u32 *dst, const u32 *src, - const struct nft_bitwise *priv) +static void nft_bitwise_eval_mask_xor(u32 *dst, const u32 *src, + const struct nft_bitwise *priv) { unsigned int i; @@ -68,8 +68,8 @@ void nft_bitwise_eval(const struct nft_expr *expr, u32 *dst = ®s->data[priv->dreg]; switch (priv->op) { - case NFT_BITWISE_BOOL: - nft_bitwise_eval_bool(dst, src, priv); + case NFT_BITWISE_MASK_XOR: + nft_bitwise_eval_mask_xor(dst, src, priv); break; case NFT_BITWISE_LSHIFT: nft_bitwise_eval_lshift(dst, src, priv); @@ -90,8 +90,8 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_DATA] = { .type = NLA_NESTED }, }; -static int nft_bitwise_init_bool(struct nft_bitwise *priv, - const struct nlattr *const tb[]) +static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, + const struct nlattr *const tb[]) { struct nft_data_desc mask = { .type = NFT_DATA_VALUE, @@ -185,7 +185,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, if (tb[NFTA_BITWISE_OP]) { priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])); switch (priv->op) { - case NFT_BITWISE_BOOL: + case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: break; @@ -193,12 +193,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } } else { - priv->op = NFT_BITWISE_BOOL; + priv->op = NFT_BITWISE_MASK_XOR; } switch(priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_init_bool(priv, tb); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_init_mask_xor(priv, tb); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: @@ -209,8 +209,8 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, return err; } -static int nft_bitwise_dump_bool(struct sk_buff *skb, - const struct nft_bitwise *priv) +static int nft_bitwise_dump_mask_xor(struct sk_buff *skb, + const struct nft_bitwise *priv) { if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask, NFT_DATA_VALUE, priv->len) < 0) @@ -248,8 +248,8 @@ static int nft_bitwise_dump(struct sk_buff *skb, return -1; switch (priv->op) { - case NFT_BITWISE_BOOL: - err = nft_bitwise_dump_bool(skb, priv); + case NFT_BITWISE_MASK_XOR: + err = nft_bitwise_dump_mask_xor(skb, priv); break; case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: @@ -269,7 +269,7 @@ static int nft_bitwise_offload(struct nft_offload_ctx *ctx, const struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; - if (priv->op != NFT_BITWISE_BOOL) + if (priv->op != NFT_BITWISE_MASK_XOR) return -EOPNOTSUPP; if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) || @@ -406,7 +406,7 @@ nft_bitwise_fast_dump(struct sk_buff *skb, return -1; if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(sizeof(u32)))) return -1; - if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_BOOL))) + if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(NFT_BITWISE_MASK_XOR))) return -1; data.data[0] = priv->mask; @@ -501,7 +501,7 @@ nft_bitwise_select_ops(const struct nft_ctx *ctx, return &nft_bitwise_ops; if (tb[NFTA_BITWISE_OP] && - ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_BOOL) + ntohl(nla_get_be32(tb[NFTA_BITWISE_OP])) != NFT_BITWISE_MASK_XOR) return &nft_bitwise_ops; return &nft_bitwise_fast_ops; -- cgit v1.3 From b0ccf4f53d968e794a4ea579d5135cc1aaf1a53f Mon Sep 17 00:00:00 2001 From: Jeremy Sowden Date: Thu, 14 Nov 2024 22:08:13 +0100 Subject: netfilter: bitwise: add support for doing AND, OR and XOR directly Hitherto, these operations have been converted in user space to mask-and-xor operations on one register and two immediate values, and it is the latter which have been evaluated by the kernel. We add support for evaluating these operations directly in kernel space on one register and either an immediate value or a second register. Pablo made a few changes to the original patch: - EINVAL if NFTA_BITWISE_SREG2 is used with fast version. - Allow _AND,_OR,_XOR with _DATA != sizeof(u32) - Dump _SREG2 or _DATA with _AND,_OR,_XOR Signed-off-by: Jeremy Sowden Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 8 ++ net/netfilter/nft_bitwise.c | 134 ++++++++++++++++++++++++++++--- 2 files changed, 131 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 487542234ccd..49c944e78463 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -568,11 +568,17 @@ enum nft_immediate_attributes { * and XOR boolean operations * @NFT_BITWISE_LSHIFT: left-shift operation * @NFT_BITWISE_RSHIFT: right-shift operation + * @NFT_BITWISE_AND: and operation + * @NFT_BITWISE_OR: or operation + * @NFT_BITWISE_XOR: xor operation */ enum nft_bitwise_ops { NFT_BITWISE_MASK_XOR, NFT_BITWISE_LSHIFT, NFT_BITWISE_RSHIFT, + NFT_BITWISE_AND, + NFT_BITWISE_OR, + NFT_BITWISE_XOR, }; /* * Old name for NFT_BITWISE_MASK_XOR. Retained for backwards-compatibility. @@ -590,6 +596,7 @@ enum nft_bitwise_ops { * @NFTA_BITWISE_OP: type of operation (NLA_U32: nft_bitwise_ops) * @NFTA_BITWISE_DATA: argument for non-boolean operations * (NLA_NESTED: nft_data_attributes) + * @NFTA_BITWISE_SREG2: second source register (NLA_U32: nft_registers) * * The bitwise expression supports boolean and shift operations. It implements * the boolean operations by performing the following operation: @@ -613,6 +620,7 @@ enum nft_bitwise_attributes { NFTA_BITWISE_XOR, NFTA_BITWISE_OP, NFTA_BITWISE_DATA, + NFTA_BITWISE_SREG2, __NFTA_BITWISE_MAX }; #define NFTA_BITWISE_MAX (__NFTA_BITWISE_MAX - 1) diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index 7f6a4f800537..d550910aabec 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -17,6 +17,7 @@ struct nft_bitwise { u8 sreg; + u8 sreg2; u8 dreg; enum nft_bitwise_ops op:8; u8 len; @@ -60,28 +61,72 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src, } } +static void nft_bitwise_eval_and(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] & src2[i]; +} + +static void nft_bitwise_eval_or(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] | src2[i]; +} + +static void nft_bitwise_eval_xor(u32 *dst, const u32 *src, const u32 *src2, + const struct nft_bitwise *priv) +{ + unsigned int i, n; + + for (i = 0, n = DIV_ROUND_UP(priv->len, sizeof(u32)); i < n; i++) + dst[i] = src[i] ^ src2[i]; +} + void nft_bitwise_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_bitwise *priv = nft_expr_priv(expr); - const u32 *src = ®s->data[priv->sreg]; + const u32 *src = ®s->data[priv->sreg], *src2; u32 *dst = ®s->data[priv->dreg]; - switch (priv->op) { - case NFT_BITWISE_MASK_XOR: + if (priv->op == NFT_BITWISE_MASK_XOR) { nft_bitwise_eval_mask_xor(dst, src, priv); - break; - case NFT_BITWISE_LSHIFT: + return; + } + if (priv->op == NFT_BITWISE_LSHIFT) { nft_bitwise_eval_lshift(dst, src, priv); - break; - case NFT_BITWISE_RSHIFT: + return; + } + if (priv->op == NFT_BITWISE_RSHIFT) { nft_bitwise_eval_rshift(dst, src, priv); - break; + return; + } + + src2 = priv->sreg2 ? ®s->data[priv->sreg2] : priv->data.data; + + if (priv->op == NFT_BITWISE_AND) { + nft_bitwise_eval_and(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_OR) { + nft_bitwise_eval_or(dst, src, src2, priv); + return; + } + if (priv->op == NFT_BITWISE_XOR) { + nft_bitwise_eval_xor(dst, src, src2, priv); + return; } } static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = { [NFTA_BITWISE_SREG] = { .type = NLA_U32 }, + [NFTA_BITWISE_SREG2] = { .type = NLA_U32 }, [NFTA_BITWISE_DREG] = { .type = NLA_U32 }, [NFTA_BITWISE_LEN] = { .type = NLA_U32 }, [NFTA_BITWISE_MASK] = { .type = NLA_NESTED }, @@ -105,7 +150,8 @@ static int nft_bitwise_init_mask_xor(struct nft_bitwise *priv, }; int err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || @@ -139,7 +185,8 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, int err; if (tb[NFTA_BITWISE_MASK] || - tb[NFTA_BITWISE_XOR]) + tb[NFTA_BITWISE_XOR] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_DATA]) @@ -157,6 +204,41 @@ static int nft_bitwise_init_shift(struct nft_bitwise *priv, return 0; } +static int nft_bitwise_init_bool(const struct nft_ctx *ctx, + struct nft_bitwise *priv, + const struct nlattr *const tb[]) +{ + int err; + + if (tb[NFTA_BITWISE_MASK] || + tb[NFTA_BITWISE_XOR]) + return -EINVAL; + + if ((!tb[NFTA_BITWISE_DATA] && !tb[NFTA_BITWISE_SREG2]) || + (tb[NFTA_BITWISE_DATA] && tb[NFTA_BITWISE_SREG2])) + return -EINVAL; + + if (tb[NFTA_BITWISE_DATA]) { + struct nft_data_desc desc = { + .type = NFT_DATA_VALUE, + .size = sizeof(priv->data), + .len = priv->len, + }; + + err = nft_data_init(NULL, &priv->data, &desc, + tb[NFTA_BITWISE_DATA]); + if (err < 0) + return err; + } else { + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG2], + &priv->sreg2, priv->len); + if (err < 0) + return err; + } + + return 0; +} + static int nft_bitwise_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -188,6 +270,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, case NFT_BITWISE_MASK_XOR: case NFT_BITWISE_LSHIFT: case NFT_BITWISE_RSHIFT: + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: break; default: return -EOPNOTSUPP; @@ -204,6 +289,11 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, case NFT_BITWISE_RSHIFT: err = nft_bitwise_init_shift(priv, tb); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_init_bool(ctx, priv, tb); + break; } return err; @@ -232,6 +322,21 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } +static int nft_bitwise_dump_bool(struct sk_buff *skb, + const struct nft_bitwise *priv) +{ + if (priv->sreg2) { + if (nft_dump_register(skb, NFTA_BITWISE_SREG2, priv->sreg2)) + return -1; + } else { + if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data, + NFT_DATA_VALUE, sizeof(u32)) < 0) + return -1; + } + + return 0; +} + static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { @@ -255,6 +360,11 @@ static int nft_bitwise_dump(struct sk_buff *skb, case NFT_BITWISE_RSHIFT: err = nft_bitwise_dump_shift(skb, priv); break; + case NFT_BITWISE_AND: + case NFT_BITWISE_OR: + case NFT_BITWISE_XOR: + err = nft_bitwise_dump_bool(skb, priv); + break; } return err; @@ -299,6 +409,7 @@ static bool nft_bitwise_reduce(struct nft_regs_track *track, track->regs[priv->dreg].bitwise && track->regs[priv->dreg].bitwise->ops == expr->ops && priv->sreg == bitwise->sreg && + priv->sreg2 == bitwise->sreg2 && priv->dreg == bitwise->dreg && priv->op == bitwise->op && priv->len == bitwise->len && @@ -375,7 +486,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - if (tb[NFTA_BITWISE_DATA]) + if (tb[NFTA_BITWISE_DATA] || + tb[NFTA_BITWISE_SREG2]) return -EINVAL; if (!tb[NFTA_BITWISE_MASK] || -- cgit v1.3 From c53bf100f68619acf6cedcf4cf5249a1ca2db0b4 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 14 Nov 2024 17:51:56 +0000 Subject: netdev-genl: Hold rcu_read_lock in napi_get Hold rcu_read_lock in netdev_nl_napi_get_doit, which calls napi_by_id and is required to be called under rcu_read_lock. Cc: stable@vger.kernel.org Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Signed-off-by: Joe Damato Link: https://patch.msgid.link/20241114175157.16604-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1cb954f2d39e..d2baa1af9df0 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -215,6 +215,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; rtnl_lock(); + rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { @@ -224,6 +225,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) err = -ENOENT; } + rcu_read_unlock(); rtnl_unlock(); if (err) -- cgit v1.3 From ed7231f56cd7f795ff3a831e32946a96661bfee9 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 14 Nov 2024 17:55:59 +0000 Subject: netdev-genl: Hold rcu_read_lock in napi_set Hold rcu_read_lock during netdev_nl_napi_set_doit, which calls napi_by_id and requires rcu_read_lock to be held. Closes: https://lore.kernel.org/netdev/719083c2-e277-447b-b6ea-ca3acb293a03@redhat.com/ Fixes: 1287c1ae0fc2 ("netdev-genl: Support setting per-NAPI config values") Signed-off-by: Joe Damato Link: https://patch.msgid.link/20241114175600.18882-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 765ce7c9d73b..fa119ff68698 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -346,6 +346,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); rtnl_lock(); + rcu_read_lock(); napi = napi_by_id(napi_id); if (napi) { @@ -355,6 +356,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) err = -ENOENT; } + rcu_read_unlock(); rtnl_unlock(); return err; -- cgit v1.3 From 0c0d0f42ffa6ac94cd79893b7ed419c15e1b45de Mon Sep 17 00:00:00 2001 From: Felix Maurer Date: Thu, 14 Nov 2024 12:30:05 +0100 Subject: xsk: Free skb when TX metadata options are invalid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a new skb is allocated for transmitting an xsk descriptor, i.e., for every non-multibuf descriptor or the first frag of a multibuf descriptor, but the descriptor is later found to have invalid options set for the TX metadata, the new skb is never freed. This can leak skbs until the send buffer is full which makes sending more packets impossible. Fix this by freeing the skb in the error path if we are currently dealing with the first frag, i.e., an skb allocated in this iteration of xsk_build_skb. Fixes: 48eb03dd2630 ("xsk: Add TX timestamp and TX checksum offload support") Reported-by: Michal Schmidt Signed-off-by: Felix Maurer Reviewed-by: Toke Høiland-Jørgensen Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/edb9b00fb19e680dff5a3350cd7581c5927975a8.1731581697.git.fmaurer@redhat.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 1140b2a120ca..b57d5d2904eb 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -675,6 +675,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, len = desc->len; if (!skb) { + first_frag = true; + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); @@ -685,12 +687,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_put(skb, len); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); + if (unlikely(err)) goto free_err; - } - - first_frag = true; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; @@ -758,6 +756,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, return skb; free_err: + if (first_frag && skb) + kfree_skb(skb); + if (err == -EOVERFLOW) { /* Drop the packet */ xsk_set_destructor_arg(xs->skb); -- cgit v1.3 From e51edeaf3506654ebd62c16e0ddf58da271b5200 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov <0x7f454c46@gmail.com> Date: Wed, 13 Nov 2024 18:46:44 +0000 Subject: net/netlink: Correct the comment on netlink message max cap Since commit d35c99ff77ec ("netlink: do not enter direct reclaim from netlink_dump()") the cap is 32KiB. Signed-off-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://patch.msgid.link/20241113-tcp-md5-diag-prep-v2-5-00a2a7feb1fa@gmail.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2ea4763a2004..8c78b29683a6 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2264,7 +2264,7 @@ static int netlink_dump(struct sock *sk, bool lock_taken) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being - * required, but it makes sense to _attempt_ a 16K bytes allocation + * required, but it makes sense to _attempt_ a 32KiB allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ @@ -2286,7 +2286,7 @@ static int netlink_dump(struct sock *sk, bool lock_taken) goto errout_skb; /* Trim skb to allocated size. User is expected to provide buffer as - * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at + * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the -- cgit v1.3 From 221a9c1df790fa711d65daf5ba05d0addc279153 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 14 Nov 2024 03:00:11 -0800 Subject: net: netpoll: Individualize the skb pool The current implementation of the netpoll system uses a global skb pool, which can lead to inefficient memory usage and waste when targets are disabled or no longer in use. This can result in a significant amount of memory being unnecessarily allocated and retained, potentially causing performance issues and limiting the availability of resources for other system components. Modify the netpoll system to assign a skb pool to each target instead of using a global one. This approach allows for more fine-grained control over memory allocation and deallocation, ensuring that resources are only allocated and retained as needed. Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20241114-skb_buffers_v2-v3-1-9be9f52a8b69@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 + net/core/netpoll.c | 31 +++++++++++++------------------ 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index cd4e28db0cbd..77635b885c18 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -32,6 +32,7 @@ struct netpoll { bool ipv6; u16 local_port, remote_port; u8 remote_mac[ETH_ALEN]; + struct sk_buff_head skb_pool; }; struct netpoll_info { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 94b7f07a952f..719c9aae845f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -45,9 +45,6 @@ #define MAX_UDP_CHUNK 1460 #define MAX_SKBS 32 - -static struct sk_buff_head skb_pool; - #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ @@ -234,20 +231,23 @@ void netpoll_poll_enable(struct net_device *dev) up(&ni->dev_lock); } -static void refill_skbs(void) +static void refill_skbs(struct netpoll *np) { + struct sk_buff_head *skb_pool; struct sk_buff *skb; unsigned long flags; - spin_lock_irqsave(&skb_pool.lock, flags); - while (skb_pool.qlen < MAX_SKBS) { + skb_pool = &np->skb_pool; + + spin_lock_irqsave(&skb_pool->lock, flags); + while (skb_pool->qlen < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(&skb_pool, skb); + __skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool.lock, flags); + spin_unlock_irqrestore(&skb_pool->lock, flags); } static void zap_completion_queue(void) @@ -284,12 +284,12 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) struct sk_buff *skb; zap_completion_queue(); - refill_skbs(); + refill_skbs(np); repeat: skb = alloc_skb(len, GFP_ATOMIC); if (!skb) - skb = skb_dequeue(&skb_pool); + skb = skb_dequeue(&np->skb_pool); if (!skb) { if (++count < 10) { @@ -673,6 +673,8 @@ int netpoll_setup(struct netpoll *np) struct in_device *in_dev; int err; + skb_queue_head_init(&np->skb_pool); + rtnl_lock(); if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; @@ -773,7 +775,7 @@ put_noaddr: } /* fill up the skb queue */ - refill_skbs(); + refill_skbs(np); err = __netpoll_setup(np, ndev); if (err) @@ -792,13 +794,6 @@ unlock: } EXPORT_SYMBOL(netpoll_setup); -static int __init netpoll_init(void) -{ - skb_queue_head_init(&skb_pool); - return 0; -} -core_initcall(netpoll_init); - static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) { struct netpoll_info *npinfo = -- cgit v1.3 From 6c59f16f1770481a6ee684720ec55b1e38b3a4b2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 14 Nov 2024 03:00:12 -0800 Subject: net: netpoll: flush skb pool during cleanup The netpoll subsystem maintains a pool of 32 pre-allocated SKBs per instance, but these SKBs are not freed when the netpoll user is brought down. This leads to memory waste as these buffers remain allocated but unused. Add skb_pool_flush() to properly clean up these SKBs when netconsole is terminated, improving memory efficiency. Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20241114-skb_buffers_v2-v3-2-9be9f52a8b69@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 719c9aae845f..00e1e4a32902 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -531,6 +531,14 @@ static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr) return -1; } +static void skb_pool_flush(struct netpoll *np) +{ + struct sk_buff_head *skb_pool; + + skb_pool = &np->skb_pool; + skb_queue_purge_reason(skb_pool, SKB_CONSUMED); +} + int netpoll_parse_options(struct netpoll *np, char *opt) { char *cur=opt, *delim; @@ -779,10 +787,12 @@ put_noaddr: err = __netpoll_setup(np, ndev); if (err) - goto put; + goto flush; rtnl_unlock(); return 0; +flush: + skb_pool_flush(np); put: DEBUG_NET_WARN_ON_ONCE(np->dev); if (ip_overwritten) @@ -830,6 +840,8 @@ void __netpoll_cleanup(struct netpoll *np) call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); } else RCU_INIT_POINTER(np->dev->npinfo, NULL); + + skb_pool_flush(np); } EXPORT_SYMBOL_GPL(__netpoll_cleanup); -- cgit v1.3 From 4b42fbc6bd8f73d9ded535d8c61ccaa837ff3bd4 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:53 +0100 Subject: ndo_fdb_add: Add a parameter to report whether notification was sent Currently when FDB entries are added to or deleted from a VXLAN netdevice, the VXLAN driver emits one notification, including the VXLAN-specific attributes. The core however always sends a notification as well, a generic one. Thus two notifications are unnecessarily sent for these operations. A similar situation comes up with bridge driver, which also emits notifications on its own: # ip link add name vx type vxlan id 1000 dstport 4789 # bridge monitor fdb & [1] 1981693 # bridge fdb add de:ad:be:ef:13:37 dev vx self dst 192.0.2.1 de:ad:be:ef:13:37 dev vx dst 192.0.2.1 self permanent de:ad:be:ef:13:37 dev vx self permanent In order to prevent this duplicity, add a paremeter to ndo_fdb_add, bool *notified. The flag is primed to false, and if the callee sends a notification on its own, it sets it to true, thus informing the core that it should not generate another notification. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/cbf6ae8195e85cbf922f8058ce4eba770f3b71ed.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 ++- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mscc/ocelot_net.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/vxlan/vxlan_core.c | 5 ++++- include/linux/netdevice.h | 5 ++++- net/bridge/br_fdb.c | 12 +++++++----- net/bridge/br_private.h | 2 +- net/core/rtnetlink.c | 9 ++++++--- 12 files changed, 32 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 55fb362eb508..ab5febf83ec3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -13095,12 +13095,13 @@ static int i40e_get_phys_port_id(struct net_device *netdev, * @addr: the MAC address entry being added * @vid: VLAN ID * @flags: instructions from stack about fdb operation + * @notified: whether notification was emitted * @extack: netlink extended ack, unused currently */ static int i40e_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct i40e_netdev_priv *np = netdev_priv(dev); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a6f586f9bfd1..c875036f654b 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -6125,12 +6125,14 @@ ice_set_tx_maxrate(struct net_device *netdev, int queue_index, u32 maxrate) * @addr: the MAC address entry being added * @vid: VLAN ID * @flags: instructions from stack about fdb operation + * @notified: whether notification was emitted * @extack: netlink extended ack */ static int ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, struct netlink_ext_ack __always_unused *extack) + u16 flags, bool *notified, + struct netlink_ext_ack __always_unused *extack) { int err; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index f1d088168723..f0528bd13184 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2486,7 +2486,7 @@ static int igb_set_features(struct net_device *netdev, static int igb_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { /* guarantee we can provide a unique filter for the unicast address */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 8b8404d8c946..adc9392463ce 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9954,7 +9954,7 @@ static int ixgbe_set_features(struct net_device *netdev, static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { /* guarantee we can provide a unique filter for the unicast address */ diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 7c9540a71725..4f15ba2c5525 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -730,7 +730,7 @@ static void ocelot_get_stats64(struct net_device *dev, static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid, u16 flags, + u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index b3588a1ebc25..2484cebd97d4 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -394,7 +394,7 @@ static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], static int qlcnic_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *netdev, const unsigned char *addr, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(netdev); int err = 0; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index edbd5afcec41..dfb462e63248 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1024,7 +1024,7 @@ static int macvlan_vlan_rx_kill_vid(struct net_device *dev, static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - u16 flags, + u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 42b07bc2b107..22f17c5c7549 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1241,7 +1241,7 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ @@ -1277,6 +1277,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); + if (!err) + *notified = true; + return err; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0aae346d919e..6a7fd191e1ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1248,8 +1248,10 @@ struct netdev_net_notifier { * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, - * struct netlink_ext_ack *extack); + * bool *notified, struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr. + * Callee shall set *notified to true if it sent any appropriate + * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid) @@ -1525,6 +1527,7 @@ struct net_device_ops { const unsigned char *addr, u16 vid, u16 flags, + bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 77f110035df1..5f29958f3ddd 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1152,7 +1152,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[], - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { int err = 0; @@ -1183,6 +1183,8 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, spin_unlock_bh(&br->hash_lock); } + if (!err) + *notified = true; return err; } @@ -1195,7 +1197,7 @@ static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = { int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct nlattr *nfea_tb[NFEA_MAX + 1], *attr; struct net_bridge_vlan_group *vg; @@ -1258,10 +1260,10 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], /* VID was specified, so use it. */ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb, - extack); + notified, extack); } else { err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb, - extack); + notified, extack); if (err || !vg || !vg->num_vlans) goto out; @@ -1273,7 +1275,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (!br_vlan_should_use(v)) continue; err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid, - nfea_tb, extack); + nfea_tb, notified, extack); if (err) goto out; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 041f6e571a20..ebfc59049ec1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -858,7 +858,7 @@ int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags, - struct netlink_ext_ack *extack); + bool *notified, struct netlink_ext_ack *extack); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 327fa4957929..f31b2436cde5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4578,9 +4578,10 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; + bool notified = false; err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, - nlh->nlmsg_flags, extack); + nlh->nlmsg_flags, ¬ified, extack); if (err) goto out; else @@ -4589,16 +4590,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { + bool notified = false; + if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, - extack); + ¬ified, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); - if (!err) { + if (!err && !notified) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; -- cgit v1.3 From 42575ad5aab932273475d1ec3e7881cb5a05420e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 14 Nov 2024 15:09:54 +0100 Subject: ndo_fdb_del: Add a parameter to report whether notification was sent In a similar fashion to ndo_fdb_add, which was covered in the previous patch, add the bool *notified argument to ndo_fdb_del. Callees that send a notification on their own set the flag to true. Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/06b1acf4953ef0a5ed153ef1f32d7292044f2be6.1731589511.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_main.c | 4 +++- drivers/net/ethernet/mscc/ocelot_net.c | 2 +- drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/vxlan/vxlan_core.c | 5 ++++- include/linux/netdevice.h | 9 +++++++-- net/bridge/br_fdb.c | 15 ++++++++------- net/bridge/br_private.h | 2 +- net/core/rtnetlink.c | 11 ++++++++--- 9 files changed, 34 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index c875036f654b..b79848fe2a9e 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -6166,12 +6166,14 @@ ice_fdb_add(struct ndmsg *ndm, struct nlattr __always_unused *tb[], * @dev: the net device pointer * @addr: the MAC address entry being added * @vid: VLAN ID + * @notified: whether notification was emitted * @extack: netlink extended ack */ static int ice_fdb_del(struct ndmsg *ndm, __always_unused struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - __always_unused u16 vid, struct netlink_ext_ack *extack) + __always_unused u16 vid, bool *notified, + struct netlink_ext_ack *extack) { int err; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 4f15ba2c5525..558e03301aa8 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -744,7 +744,7 @@ static int ocelot_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int ocelot_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - struct netlink_ext_ack *extack) + bool *notified, struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index 2484cebd97d4..eb69121df726 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -367,7 +367,7 @@ static int qlcnic_set_mac(struct net_device *netdev, void *p) static int qlcnic_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *netdev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct qlcnic_adapter *adapter = netdev_priv(netdev); diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index dfb462e63248..fed4fe2a4748 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1049,7 +1049,7 @@ static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct macvlan_dev *vlan = netdev_priv(dev); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 22f17c5c7549..9ea63059d52d 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1319,7 +1319,7 @@ out: /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -1341,6 +1341,9 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); + if (!err) + *notified = true; + return err; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a7fd191e1ee..ecc686409161 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1254,8 +1254,11 @@ struct netdev_net_notifier { * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, - * const unsigned char *addr, u16 vid) + * const unsigned char *addr, u16 vid + * bool *notified, struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. + * Callee shall set *notified to true if it sent any appropriate + * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, @@ -1533,7 +1536,9 @@ struct net_device_ops { struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, - u16 vid, struct netlink_ext_ack *extack); + u16 vid, + bool *notified, + struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 5f29958f3ddd..82bac2426631 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1287,7 +1287,7 @@ out: static int fdb_delete_by_addr_and_port(struct net_bridge *br, const struct net_bridge_port *p, - const u8 *addr, u16 vlan) + const u8 *addr, u16 vlan, bool *notified) { struct net_bridge_fdb_entry *fdb; @@ -1296,18 +1296,19 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br, return -ENOENT; fdb_delete(br, fdb, true); + *notified = true; return 0; } static int __br_fdb_delete(struct net_bridge *br, const struct net_bridge_port *p, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, bool *notified) { int err; spin_lock_bh(&br->hash_lock); - err = fdb_delete_by_addr_and_port(br, p, addr, vid); + err = fdb_delete_by_addr_and_port(br, p, addr, vid, notified); spin_unlock_bh(&br->hash_lock); return err; @@ -1316,7 +1317,7 @@ static int __br_fdb_delete(struct net_bridge *br, /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; @@ -1339,19 +1340,19 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], } if (vid) { - err = __br_fdb_delete(br, p, addr, vid); + err = __br_fdb_delete(br, p, addr, vid, notified); } else { struct net_bridge_vlan *v; err = -ENOENT; - err &= __br_fdb_delete(br, p, addr, 0); + err &= __br_fdb_delete(br, p, addr, 0, notified); if (!vg || !vg->num_vlans) return err; list_for_each_entry(v, &vg->vlan_list, vlist) { if (!br_vlan_should_use(v)) continue; - err &= __br_fdb_delete(br, p, addr, v->vid); + err &= __br_fdb_delete(br, p, addr, v->vid, notified); } } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ebfc59049ec1..9853cfbb9d14 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -853,7 +853,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, - struct netlink_ext_ack *extack); + bool *notified, struct netlink_ext_ack *extack); int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f31b2436cde5..dd142f444659 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4701,11 +4701,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); + bool notified = false; ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(nlh, dev, extack); @@ -4719,10 +4721,13 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { + bool notified = false; + ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, + ¬ified, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { @@ -4733,7 +4738,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, } if (!err) { - if (!del_bulk) + if (!del_bulk && !notified) rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; -- cgit v1.3 From ff1060813d9347e8c45c8b8cff93a4dfdb6726ad Mon Sep 17 00:00:00 2001 From: Mirsad Todorovac Date: Sat, 9 Nov 2024 22:18:41 +0100 Subject: net/9p/usbg: fix handling of the failed kzalloc() memory allocation On the linux-next, next-20241108 vanilla kernel, the coccinelle tool gave the following error report: ./net/9p/trans_usbg.c:912:5-11: ERROR: allocation function on line 911 returns NULL not ERR_PTR on failure kzalloc() failure is fixed to handle the NULL return case on the memory exhaustion. Fixes: a3be076dc174d ("net/9p/usbg: Add new usb gadget function transport") Cc: Michael Grzeschik Cc: Eric Van Hensbergen Cc: Latchesar Ionkov Cc: Dominique Martinet Cc: Christian Schoenebeck Cc: v9fs@lists.linux.dev Cc: linux-kernel@vger.kernel.org Signed-off-by: Mirsad Todorovac Message-ID: <20241109211840.721226-2-mtodorovac69@gmail.com> Signed-off-by: Dominique Martinet --- net/9p/trans_usbg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c index 975b76839dca..6b694f117aef 100644 --- a/net/9p/trans_usbg.c +++ b/net/9p/trans_usbg.c @@ -909,9 +909,9 @@ static struct usb_function_instance *usb9pfs_alloc_instance(void) usb9pfs_opts->buflen = DEFAULT_BUFLEN; dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (IS_ERR(dev)) { + if (!dev) { kfree(usb9pfs_opts); - return ERR_CAST(dev); + return ERR_PTR(-ENOMEM); } usb9pfs_opts->dev = dev; -- cgit v1.3 From accdd51dc74ff65b7b7be1961b11723d228fbbbd Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:04 +0800 Subject: net/udp: Add a new struct for hash2 slot Preparing for udp 4-tuple hash (uhash4 for short). To implement uhash4 without cache line missing when lookup, hslot2 is used to record the number of hashed sockets in hslot4. Thus adding a new struct udp_hslot_main with field hash4_cnt, which is used by hash2. The new struct is used to avoid doubling the size of udp_hslot. Before uhash4 lookup, firstly checking hash4_cnt to see if there are hashed sks in hslot4. Because hslot2 is always used in lookup, there is no cache line miss. Related helpers are updated, and use the helpers as possible. uhash4 is implemented in following patches. Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 38 ++++++++++++++++++++++++++++++++++---- net/ipv4/udp.c | 44 +++++++++++++++++++++++--------------------- net/ipv6/udp.c | 15 ++++++--------- 3 files changed, 63 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index 61222545ab1c..62a7207e65f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,7 +50,7 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot + * struct udp_hslot - UDP hash slot used by udp_table.hash * * @head: head of list of sockets * @count: number of sockets in 'head' list @@ -60,7 +60,22 @@ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; -} __attribute__((aligned(2 * sizeof(long)))); +} __aligned(2 * sizeof(long)); + +/** + * struct udp_hslot_main - UDP hash slot used by udp_table.hash2 + * + * @hslot: basic hash slot + * @hash4_cnt: number of sockets in hslot4 of the same + * (local port, local address) + */ +struct udp_hslot_main { + struct udp_hslot hslot; /* must be the first member */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + u32 hash4_cnt; +#endif +} __aligned(2 * sizeof(long)); +#define UDP_HSLOT_MAIN(__hslot) ((struct udp_hslot_main *)(__hslot)) /** * struct udp_table - UDP table @@ -72,7 +87,7 @@ struct udp_hslot { */ struct udp_table { struct udp_hslot *hash; - struct udp_hslot *hash2; + struct udp_hslot_main *hash2; unsigned int mask; unsigned int log; }; @@ -84,6 +99,7 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, { return &table->hash[udp_hashfn(net, num, table->mask)]; } + /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() @@ -91,8 +107,22 @@ static inline struct udp_hslot *udp_hashslot(struct udp_table *table, static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { - return &table->hash2[hash & table->mask]; + return &table->hash2[hash & table->mask].hslot; +} + +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_table_hash4_init(struct udp_table *table) +{ +} +#else /* !CONFIG_BASE_SMALL */ + +/* Must be called with table->hash2 initialized */ +static inline void udp_table_hash4_init(struct udp_table *table) +{ + for (int i = 0; i <= table->mask; i++) + table->hash2[i].hash4_cnt = 0; } +#endif /* CONFIG_BASE_SMALL */ extern struct proto udp_prot; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0e24916b39d4..2fdac5fae2a8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -486,13 +486,12 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv4_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, @@ -519,8 +518,7 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, @@ -2268,7 +2266,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -2539,14 +2537,13 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -3187,7 +3184,7 @@ again: batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { - struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; + struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) continue; @@ -3428,10 +3425,11 @@ __setup("uhash_entries=", set_uhash_entries); void __init udp_table_init(struct udp_table *table, const char *name) { - unsigned int i; + unsigned int i, slot_size; + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); table->hash = alloc_large_system_hash(name, - 2 * sizeof(struct udp_hslot), + slot_size, uhash_entries, 21, /* one slot per 2 MB */ 0, @@ -3440,17 +3438,18 @@ void __init udp_table_init(struct udp_table *table, const char *name) UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); - table->hash2 = table->hash + (table->mask + 1); + table->hash2 = (void *)(table->hash + (table->mask + 1)); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_HEAD(&table->hash2[i].head); - table->hash2[i].count = 0; - spin_lock_init(&table->hash2[i].lock); + INIT_HLIST_HEAD(&table->hash2[i].hslot.head); + table->hash2[i].hslot.count = 0; + spin_lock_init(&table->hash2[i].hslot.lock); } + udp_table_hash4_init(table); } u32 udp_flow_hashrnd(void) @@ -3476,18 +3475,20 @@ static void __net_init udp_sysctl_init(struct net *net) static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; + unsigned int slot_size; int i; udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); if (!udptable) goto out; - udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); + udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; - udptable->hash2 = udptable->hash + hash_entries; + udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); @@ -3496,10 +3497,11 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); - INIT_HLIST_HEAD(&udptable->hash2[i].head); - udptable->hash2[i].count = 0; - spin_lock_init(&udptable->hash2[i].lock); + INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head); + udptable->hash2[i].hslot.count = 0; + spin_lock_init(&udptable->hash2[i].hslot.lock); } + udp_table_hash4_init(udptable); return udptable; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0cef8ae5d1ea..0d7aac9d44e5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -224,13 +224,12 @@ struct sock *__udp6_lib_lookup(const struct net *net, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); - unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; + unsigned int hash2; hash2 = ipv6_portaddr_hash(net, daddr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, @@ -257,8 +256,7 @@ struct sock *__udp6_lib_lookup(const struct net *net, /* Lookup wildcard sockets */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, @@ -859,7 +857,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, udptable->mask; hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: - hslot = &udptable->hash2[hash2]; + hslot = &udptable->hash2[hash2].hslot; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } @@ -1065,14 +1063,13 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); - unsigned int hash2, slot2; struct udp_hslot *hslot2; + unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udptable->mask; - hslot2 = &udptable->hash2[slot2]; + hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { -- cgit v1.3 From dab78a1745ab3c6001e1e4d50a9d09efef8e260d Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:05 +0800 Subject: net/udp: Add 4-tuple hash list basis Add a new hash list, hash4, in udp table. It will be used to implement 4-tuple hash for connected udp sockets. This patch adds the hlist to table, and implements helpers and the initialization. 4-tuple hash is implemented in the following patch. hash4 uses hlist_nulls to avoid moving wrongly onto another hlist due to concurrent rehash, because rehash() can happen with lookup(). Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/udp.h | 11 +++++++ include/net/udp.h | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/udp.c | 6 ++-- 3 files changed, 97 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 3eb3f2b9a2a0..0807e21cfec9 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -56,6 +56,12 @@ struct udp_sock { int pending; /* Any pending frames ? */ __u8 encap_type; /* Is this an Encapsulation socket? */ +#if !IS_ENABLED(CONFIG_BASE_SMALL) + /* For UDP 4-tuple hash */ + __u16 udp_lrpa_hash; + struct hlist_nulls_node udp_lrpa_node; +#endif + /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -206,6 +212,11 @@ static inline void udp_allow_gso(struct sock *sk) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) +#if !IS_ENABLED(CONFIG_BASE_SMALL) +#define udp_lrpa_for_each_entry_rcu(__up, node, list) \ + hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) +#endif + #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 62a7207e65f2..edb669967130 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -50,14 +50,21 @@ struct udp_skb_cb { #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** - * struct udp_hslot - UDP hash slot used by udp_table.hash + * struct udp_hslot - UDP hash slot used by udp_table.hash/hash4 * * @head: head of list of sockets + * @nulls_head: head of list of sockets, only used by hash4 * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { - struct hlist_head head; + union { + struct hlist_head head; + /* hash4 uses hlist_nulls to avoid moving wrongly onto another + * hlist, because rehash() can happen with lookup(). + */ + struct hlist_nulls_head nulls_head; + }; int count; spinlock_t lock; } __aligned(2 * sizeof(long)); @@ -82,12 +89,17 @@ struct udp_hslot_main { * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) + * @hash4: hash table, connected sockets are hashed on + * (local port, local address, remote port, remote address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot_main *hash2; +#if !IS_ENABLED(CONFIG_BASE_SMALL) + struct udp_hslot *hash4; +#endif unsigned int mask; unsigned int log; }; @@ -114,13 +126,80 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, static inline void udp_table_hash4_init(struct udp_table *table) { } + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + BUILD_BUG(); + return NULL; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return false; +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return 0; +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return false; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ +} #else /* !CONFIG_BASE_SMALL */ /* Must be called with table->hash2 initialized */ static inline void udp_table_hash4_init(struct udp_table *table) { - for (int i = 0; i <= table->mask; i++) + table->hash4 = (void *)(table->hash2 + (table->mask + 1)); + for (int i = 0; i <= table->mask; i++) { table->hash2[i].hash4_cnt = 0; + + INIT_HLIST_NULLS_HEAD(&table->hash4[i].nulls_head, i); + table->hash4[i].count = 0; + spin_lock_init(&table->hash4[i].lock); + } +} + +static inline struct udp_hslot *udp_hashslot4(struct udp_table *table, + unsigned int hash) +{ + return &table->hash4[hash & table->mask]; +} + +static inline bool udp_hashed4(const struct sock *sk) +{ + return !hlist_nulls_unhashed(&udp_sk(sk)->udp_lrpa_node); +} + +static inline unsigned int udp_hash4_slot_size(void) +{ + return sizeof(struct udp_hslot); +} + +static inline bool udp_has_hash4(const struct udp_hslot *hslot2) +{ + return UDP_HSLOT_MAIN(hslot2)->hash4_cnt; +} + +static inline void udp_hash4_inc(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt++; +} + +static inline void udp_hash4_dec(struct udp_hslot *hslot2) +{ + UDP_HSLOT_MAIN(hslot2)->hash4_cnt--; } #endif /* CONFIG_BASE_SMALL */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2fdac5fae2a8..0bc0881d6569 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3427,7 +3427,8 @@ void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i, slot_size; - slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + + udp_hash4_slot_size(); table->hash = alloc_large_system_hash(name, slot_size, uhash_entries, @@ -3482,7 +3483,8 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent if (!udptable) goto out; - slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main); + slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + + udp_hash4_slot_size(); udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) -- cgit v1.3 From 78c91ae2c6deb5d236a5a93ff2995cdd05514380 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:06 +0800 Subject: ipv4/udp: Add 4-tuple hash for connected socket Currently, the udp_table has two hash table, the port hash and portaddr hash. Usually for UDP servers, all sockets have the same local port and addr, so they are all on the same hash slot within a reuseport group. In some applications, UDP servers use connect() to manage clients. In particular, when firstly receiving from an unseen 4 tuple, a new socket is created and connect()ed to the remote addr:port, and then the fd is used exclusively by the client. Once there are connected sks in a reuseport group, udp has to score all sks in the same hash2 slot to find the best match. This could be inefficient with a large number of connections, resulting in high softirq overhead. To solve the problem, this patch implement 4-tuple hash for connected udp sockets. During connect(), hash4 slot is updated, as well as a corresponding counter, hash4_cnt, in hslot2. In __udp4_lib_lookup(), hslot4 will be searched firstly if the counter is non-zero. Otherwise, hslot2 is used like before. Note that only connected sockets enter this hash4 path, while un-connected ones are not affected. hlist_nulls is used for hash4, because we probably move to another hslot wrongly when lookup with concurrent rehash. Then we check nulls at the list end to see if we should restart lookup. Because udp does not use SLAB_TYPESAFE_BY_RCU, we don't need to touch sk_refcnt when lookup. Stress test results (with 1 cpu fully used) are shown below, in pps: (1) _un-connected_ socket as server [a] w/o hash4: 1,825176 [b] w/ hash4: 1,831750 (+0.36%) (2) 500 _connected_ sockets as server [c] w/o hash4: 290860 (only 16% of [a]) [d] w/ hash4: 1,889658 (+3.1% compared with [b]) With hash4, compute_score is skipped when lookup, so [d] is slightly better than [b]. Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 16 ++++- net/ipv4/udp.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/udp.c | 2 +- 3 files changed, 210 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index edb669967130..feb06c0e48fb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -302,13 +302,27 @@ static inline int udp_lib_hash(struct sock *sk) } void udp_lib_unhash(struct sock *sk); -void udp_lib_rehash(struct sock *sk, u16 new_hash); +void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } +/* hash4 routines shared between UDPv4/6 */ +#if IS_ENABLED(CONFIG_BASE_SMALL) +static inline void udp_lib_hash4(struct sock *sk, u16 hash) +{ +} + +static inline void udp4_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +void udp_lib_hash4(struct sock *sk, u16 hash); +void udp4_hash4(struct sock *sk); +#endif /* CONFIG_BASE_SMALL */ + int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0bc0881d6569..b6c5edd7ff48 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -478,6 +478,159 @@ rescore: return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp4_lib_lookup4(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp_rehash4(struct udp_table *udptable, struct sock *sk, + u16 newhash4) +{ +} + +static void udp_unhash4(struct udp_table *udptable, struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp4_lib_lookup4(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + INET_ADDR_COOKIE(acookie, saddr, daddr); + +begin: + /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */ + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet_match(net, sk, acookie, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +/* In hash4, rehash can happen in connect(), where hash4_cnt keeps unchanged. */ +static void udp_rehash4(struct udp_table *udptable, struct sock *sk, + u16 newhash4) +{ + struct udp_hslot *hslot4, *nhslot4; + + hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); + nhslot4 = udp_hashslot4(udptable, newhash4); + udp_sk(sk)->udp_lrpa_hash = newhash4; + + if (hslot4 != nhslot4) { + spin_lock_bh(&hslot4->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); + hslot4->count--; + spin_unlock_bh(&hslot4->lock); + + spin_lock_bh(&nhslot4->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, + &nhslot4->nulls_head); + nhslot4->count++; + spin_unlock_bh(&nhslot4->lock); + } +} + +static void udp_unhash4(struct udp_table *udptable, struct sock *sk) +{ + struct udp_hslot *hslot2, *hslot4; + + if (udp_hashed4(sk)) { + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); + + spin_lock(&hslot4->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); + hslot4->count--; + spin_unlock(&hslot4->lock); + + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + } +} + +void udp_lib_hash4(struct sock *sk, u16 hash) +{ + struct udp_hslot *hslot, *hslot2, *hslot4; + struct net *net = sock_net(sk); + struct udp_table *udptable; + + /* Connected udp socket can re-connect to another remote address, + * so rehash4 is needed. + */ + udptable = net->ipv4.udp_table; + if (udp_hashed4(sk)) { + udp_rehash4(udptable, sk, hash); + return; + } + + hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + hslot4 = udp_hashslot4(udptable, hash); + udp_sk(sk)->udp_lrpa_hash = hash; + + spin_lock_bh(&hslot->lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); + + spin_lock(&hslot4->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, + &hslot4->nulls_head); + hslot4->count++; + spin_unlock(&hslot4->lock); + + spin_lock(&hslot2->lock); + udp_hash4_inc(hslot2); + spin_unlock(&hslot2->lock); + + spin_unlock_bh(&hslot->lock); +} +EXPORT_SYMBOL(udp_lib_hash4); + +/* call with sock lock */ +void udp4_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY)) + return; + + hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +EXPORT_SYMBOL(udp4_hash4); +#endif /* CONFIG_BASE_SMALL */ + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -493,6 +646,13 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, hash2 = ipv4_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); + if (udp_has_hash4(hslot2)) { + result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp4_lib_lookup4 return sk or NULL */ + return result; + } + /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, @@ -1933,6 +2093,18 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_SYMBOL(udp_pre_connect); +static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip4_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp4_hash4(sk); + release_sock(sk); + return res; +} + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -1992,6 +2164,8 @@ void udp_lib_unhash(struct sock *sk) hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); + + udp_unhash4(udptable, sk); } spin_unlock_bh(&hslot->lock); } @@ -2001,7 +2175,7 @@ EXPORT_SYMBOL(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash */ -void udp_lib_rehash(struct sock *sk, u16 newhash) +void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); @@ -2033,6 +2207,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) spin_unlock(&nhslot2->lock); } + if (udp_hashed4(sk)) { + udp_rehash4(udptable, sk, newhash4); + + if (hslot2 != nhslot2) { + spin_lock(&hslot2->lock); + udp_hash4_dec(hslot2); + spin_unlock(&hslot2->lock); + + spin_lock(&nhslot2->lock); + udp_hash4_inc(nhslot2); + spin_unlock(&nhslot2->lock); + } + } spin_unlock_bh(&hslot->lock); } } @@ -2044,7 +2231,11 @@ void udp_v4_rehash(struct sock *sk) u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); - udp_lib_rehash(sk, new_hash); + u16 new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + + udp_lib_rehash(sk, new_hash, new_hash4); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -2937,7 +3128,7 @@ struct proto udp_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udp_pre_connect, - .connect = ip4_datagram_connect, + .connect = udp_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udp_init_sock, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0d7aac9d44e5..1ea99d704e31 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -111,7 +111,7 @@ void udp_v6_rehash(struct sock *sk) &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); - udp_lib_rehash(sk, new_hash); + udp_lib_rehash(sk, new_hash, 0); /* 4-tuple hash not implemented */ } static int compute_score(struct sock *sk, const struct net *net, -- cgit v1.3 From 1b29a730ef8b6fd3aa3e11c2f6d409cf201cd913 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Thu, 14 Nov 2024 18:52:07 +0800 Subject: ipv6/udp: Add 4-tuple hash for connected socket Implement ipv6 udp hash4 like that in ipv4. The major difference is that the hash value should be calculated with udp6_ehashfn(). Besides, ipv4-mapped ipv6 address is handled before hash() and rehash(). Export udp_ehashfn because now we use it in udpv6 rehash. Core procedures of hash/unhash/rehash are same as ipv4, and udpv4 and udpv6 share the same udptable, so some functions in ipv4 hash4 can also be shared. Co-developed-by: Cambda Zhu Signed-off-by: Cambda Zhu Co-developed-by: Fred Chen Signed-off-by: Fred Chen Co-developed-by: Yubing Qiu Signed-off-by: Yubing Qiu Signed-off-by: Philo Lu Acked-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 2 ++ net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/udp.h b/include/net/udp.h index feb06c0e48fb..6e89520e100d 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -303,6 +303,8 @@ static inline int udp_lib_hash(struct sock *sk) void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4); +u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, + const __be32 faddr, const __be16 fport); static inline void udp_lib_close(struct sock *sk, long timeout) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b6c5edd7ff48..6a01905d379f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -410,7 +410,6 @@ static int compute_score(struct sock *sk, const struct net *net, return score; } -INDIRECT_CALLABLE_SCOPE u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { @@ -419,6 +418,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } +EXPORT_SYMBOL(udp_ehashfn); /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1ea99d704e31..d766fd798ecf 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -110,8 +110,19 @@ void udp_v6_rehash(struct sock *sk) u16 new_hash = ipv6_portaddr_hash(sock_net(sk), &sk->sk_v6_rcv_saddr, inet_sk(sk)->inet_num); + u16 new_hash4; - udp_lib_rehash(sk, new_hash, 0); /* 4-tuple hash not implemented */ + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + new_hash4 = udp_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); + } else { + new_hash4 = udp6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + } + + udp_lib_rehash(sk, new_hash, new_hash4); } static int compute_score(struct sock *sk, const struct net *net, @@ -216,6 +227,74 @@ rescore: return result; } +#if IS_ENABLED(CONFIG_BASE_SMALL) +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ +} +#else /* !CONFIG_BASE_SMALL */ +static struct sock *udp6_lib_lookup4(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + struct udp_table *udptable) +{ + const __portpair ports = INET_COMBINED_PORTS(sport, hnum); + const struct hlist_nulls_node *node; + struct udp_hslot *hslot4; + unsigned int hash4, slot; + struct udp_sock *up; + struct sock *sk; + + hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash4 & udptable->mask; + hslot4 = &udptable->hash4[slot]; + +begin: + udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { + sk = (struct sock *)up; + if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) + return sk; + } + + /* if the nulls value we got at the end of this lookup is not the + * expected one, we must restart lookup. We probably met an item that + * was moved to another chain due to rehash. + */ + if (get_nulls_value(node) != slot) + goto begin; + + return NULL; +} + +static void udp6_hash4(struct sock *sk) +{ + struct net *net = sock_net(sk); + unsigned int hash; + + if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) { + udp4_hash4(sk); + return; + } + + if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return; + + hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); + + udp_lib_hash4(sk, hash); +} +#endif /* CONFIG_BASE_SMALL */ + /* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -231,6 +310,13 @@ struct sock *__udp6_lib_lookup(const struct net *net, hash2 = ipv6_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); + if (udp_has_hash4(hslot2)) { + result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum, + dif, sdif, udptable); + if (result) /* udp6_lib_lookup4 return sk or NULL */ + return result; + } + /* Lookup connected or non-wildcard sockets */ result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, @@ -1166,6 +1252,18 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } +static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int res; + + lock_sock(sk); + res = __ip6_datagram_connect(sk, uaddr, addr_len); + if (!res) + udp6_hash4(sk); + release_sock(sk); + return res; +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1761,7 +1859,7 @@ struct proto udpv6_prot = { .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udpv6_pre_connect, - .connect = ip6_datagram_connect, + .connect = udpv6_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udpv6_init_sock, -- cgit v1.3 From 03854920c39c62b88c0b540c92cf35746d059af2 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:52 +0100 Subject: libceph: Remove unused ceph_pagelist functions ceph_pagelist_truncate() and ceph_pagelist_set_cursor() have been unused since commit 39be95e9c8c0 ("ceph: ceph_pagelist_append might sleep while atomic") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/pagelist.h | 12 ------------ net/ceph/pagelist.c | 38 -------------------------------------- 2 files changed, 50 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 5dead8486fd8..879bec0863aa 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -17,12 +17,6 @@ struct ceph_pagelist { refcount_t refcnt; }; -struct ceph_pagelist_cursor { - struct ceph_pagelist *pl; /* pagelist, for error checking */ - struct list_head *page_lru; /* page in list */ - size_t room; /* room remaining to reset to */ -}; - struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags); extern void ceph_pagelist_release(struct ceph_pagelist *pl); @@ -33,12 +27,6 @@ extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space); extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl); -extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c); - -extern int ceph_pagelist_truncate(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c); - static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) { __le64 ev = cpu_to_le64(v); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 74622b278d57..5a9c4be5f222 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -131,41 +131,3 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) return 0; } EXPORT_SYMBOL(ceph_pagelist_free_reserve); - -/* Create a truncation point. */ -void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c) -{ - c->pl = pl; - c->page_lru = pl->head.prev; - c->room = pl->room; -} -EXPORT_SYMBOL(ceph_pagelist_set_cursor); - -/* Truncate a pagelist to the given point. Move extra pages to reserve. - * This won't sleep. - * Returns: 0 on success, - * -EINVAL if the pagelist doesn't match the trunc point pagelist - */ -int ceph_pagelist_truncate(struct ceph_pagelist *pl, - struct ceph_pagelist_cursor *c) -{ - struct page *page; - - if (pl != c->pl) - return -EINVAL; - ceph_pagelist_unmap_tail(pl); - while (pl->head.prev != c->page_lru) { - page = list_entry(pl->head.prev, struct page, lru); - /* move from pagelist to reserve */ - list_move_tail(&page->lru, &pl->free_list); - ++pl->num_pages_free; - } - pl->room = c->room; - if (!list_empty(&pl->head)) { - page = list_entry(pl->head.prev, struct page, lru); - pl->mapped_tail = kmap(page); - } - return 0; -} -EXPORT_SYMBOL(ceph_pagelist_truncate); -- cgit v1.3 From ee1eb8ccaab8cc2ef4bda8e11a40409ee20f6405 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:53 +0100 Subject: libceph: Remove unused pagevec functions ceph_copy_user_to_page_vector() has been unused since 2013's commit e8344e668915 ("ceph: Implement writev/pwritev for sync operation.") ceph_copy_to_page_vector() has been unused since 2012's commit 913d2fdcf605 ("rbd: always pass ops array to rbd_req_sync_op()") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 6 ----- net/ceph/pagevec.c | 52 -------------------------------------------- 2 files changed, 58 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 15fb566d3f46..733e7f93db66 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -317,12 +317,6 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages); extern void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); -extern int ceph_copy_user_to_page_vector(struct page **pages, - const void __user *data, - loff_t off, size_t len); -extern void ceph_copy_to_page_vector(struct page **pages, - const void *data, - loff_t off, size_t len); extern void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 64305e7056a1..4509757d8b3b 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -55,58 +55,6 @@ struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) } EXPORT_SYMBOL(ceph_alloc_page_vector); -/* - * copy user data into a page vector - */ -int ceph_copy_user_to_page_vector(struct page **pages, - const void __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, PAGE_SIZE-po, left); - bad = copy_from_user(page_address(pages[i]) + po, data, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - po += l - bad; - if (po == PAGE_SIZE) { - po = 0; - i++; - } - } - return len; -} -EXPORT_SYMBOL(ceph_copy_user_to_page_vector); - -void ceph_copy_to_page_vector(struct page **pages, - const void *data, - loff_t off, size_t len) -{ - int i = 0; - size_t po = off & ~PAGE_MASK; - size_t left = len; - - while (left > 0) { - size_t l = min_t(size_t, PAGE_SIZE-po, left); - - memcpy(page_address(pages[i]) + po, data, l); - data += l; - left -= l; - po += l; - if (po == PAGE_SIZE) { - po = 0; - i++; - } - } -} -EXPORT_SYMBOL(ceph_copy_to_page_vector); - void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len) -- cgit v1.3 From 32844fd72b879d02f1f6b4394025349d31a09fd3 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:54 +0100 Subject: libceph: Remove unused ceph_osdc_watch_check ceph_osdc_watch_check() has been unused since it was added in commit b07d3c4bd727 ("libceph: support for checking on status of watch") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 -- net/ceph/osd_client.c | 34 ---------------------------------- 2 files changed, 36 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d7941478158c..d55b30057a45 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -626,8 +626,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, u32 timeout, struct page ***preply_pages, size_t *preply_len); -int ceph_osdc_watch_check(struct ceph_osd_client *osdc, - struct ceph_osd_linger_request *lreq); int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9d078b37fe0b..9b1168eb77ab 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4999,40 +4999,6 @@ out_put_lreq: } EXPORT_SYMBOL(ceph_osdc_notify); -/* - * Return the number of milliseconds since the watch was last - * confirmed, or an error. If there is an error, the watch is no - * longer valid, and should be destroyed with ceph_osdc_unwatch(). - */ -int ceph_osdc_watch_check(struct ceph_osd_client *osdc, - struct ceph_osd_linger_request *lreq) -{ - unsigned long stamp, age; - int ret; - - down_read(&osdc->lock); - mutex_lock(&lreq->lock); - stamp = lreq->watch_valid_thru; - if (!list_empty(&lreq->pending_lworks)) { - struct linger_work *lwork = - list_first_entry(&lreq->pending_lworks, - struct linger_work, - pending_item); - - if (time_before(lwork->queued_stamp, stamp)) - stamp = lwork->queued_stamp; - } - age = jiffies - stamp; - dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__, - lreq, lreq->linger_id, age, lreq->last_error); - /* we are truncating to msecs, so return a safe upper bound */ - ret = lreq->last_error ?: 1 + jiffies_to_msecs(age); - - mutex_unlock(&lreq->lock); - up_read(&osdc->lock); - return ret; -} - static int decode_watcher(void **p, void *end, struct ceph_watch_item *item) { u8 struct_v; -- cgit v1.3 From 3e0f59f09e3f319b6652e5b4523fe02d965515a5 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 6 Oct 2024 02:19:55 +0100 Subject: libceph: Remove unused ceph_crypto_key_encode ceph_crypto_key_encode() was added in 2010's commit 8b6e4f2d8b21 ("ceph: aes crypto and base64 encode/decode helpers") but has remained unused (the decode is used). Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crypto.c | 12 ------------ net/ceph/crypto.h | 1 - 2 files changed, 13 deletions(-) (limited to 'net') diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 051d22c0e4ad..01b2ce1e8fc0 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -74,18 +74,6 @@ int ceph_crypto_key_clone(struct ceph_crypto_key *dst, return set_secret(dst, src->key); } -int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) -{ - if (*p + sizeof(u16) + sizeof(key->created) + - sizeof(u16) + key->len > end) - return -ERANGE; - ceph_encode_16(p, key->type); - ceph_encode_copy(p, &key->created, sizeof(key->created)); - ceph_encode_16(p, key->len); - ceph_encode_copy(p, key->key, key->len); - return 0; -} - int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) { int ret; diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 13bd526349fa..23de29fc613c 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -21,7 +21,6 @@ struct ceph_crypto_key { int ceph_crypto_key_clone(struct ceph_crypto_key *dst, const struct ceph_crypto_key *src); -int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end); int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); void ceph_crypto_key_destroy(struct ceph_crypto_key *key); -- cgit v1.3 From 2862eee078a4d2d1f584e7f24fa50dddfa5f3471 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Mon, 21 Oct 2024 22:23:42 +0800 Subject: SUNRPC: make sure cache entry active before cache_show The function `c_show` was called with protection from RCU. This only ensures that `cp` will not be freed. Therefore, the reference count for `cp` can drop to zero, which will trigger a refcount use-after-free warning when `cache_get` is called. To resolve this issue, use `cache_get_rcu` to ensure that `cp` remains active. ------------[ cut here ]------------ refcount_t: addition on 0; use-after-free. WARNING: CPU: 7 PID: 822 at lib/refcount.c:25 refcount_warn_saturate+0xb1/0x120 CPU: 7 UID: 0 PID: 822 Comm: cat Not tainted 6.12.0-rc3+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014 RIP: 0010:refcount_warn_saturate+0xb1/0x120 Call Trace: c_show+0x2fc/0x380 [sunrpc] seq_read_iter+0x589/0x770 seq_read+0x1e5/0x270 proc_reg_read+0xe1/0x140 vfs_read+0x125/0x530 ksys_read+0xc1/0x160 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Cc: stable@vger.kernel.org # v4.20+ Signed-off-by: Yang Erkun Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 1bd3e531b0e0..059f6ef1ad18 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1427,7 +1427,9 @@ static int c_show(struct seq_file *m, void *p) seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), kref_read(&cp->ref), cp->flags); - cache_get(cp); + if (!cache_get_rcu(cp)) + return 0; + if (cache_check(cd, cp, NULL)) /* cache_check does a cache_put on failure */ seq_puts(m, "# "); -- cgit v1.3 From ce89e742a4c12b20f09a43fec1b21db33f2166cd Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 24 Oct 2024 09:55:20 +0800 Subject: svcrdma: fix miss destroy percpu_counter in svc_rdma_proc_init() There's issue as follows: RPC: Registered rdma transport module. RPC: Registered rdma backchannel transport module. RPC: Unregistered rdma transport module. RPC: Unregistered rdma backchannel transport module. BUG: unable to handle page fault for address: fffffbfff80c609a PGD 123fee067 P4D 123fee067 PUD 123fea067 PMD 10c624067 PTE 0 Oops: Oops: 0000 [#1] PREEMPT SMP KASAN NOPTI RIP: 0010:percpu_counter_destroy_many+0xf7/0x2a0 Call Trace: __die+0x1f/0x70 page_fault_oops+0x2cd/0x860 spurious_kernel_fault+0x36/0x450 do_kern_addr_fault+0xca/0x100 exc_page_fault+0x128/0x150 asm_exc_page_fault+0x26/0x30 percpu_counter_destroy_many+0xf7/0x2a0 mmdrop+0x209/0x350 finish_task_switch.isra.0+0x481/0x840 schedule_tail+0xe/0xd0 ret_from_fork+0x23/0x80 ret_from_fork_asm+0x1a/0x30 If register_sysctl() return NULL, then svc_rdma_proc_cleanup() will not destroy the percpu counters which init in svc_rdma_proc_init(). If CONFIG_HOTPLUG_CPU is enabled, residual nodes may be in the 'percpu_counters' list. The above issue may occur once the module is removed. If the CONFIG_HOTPLUG_CPU configuration is not enabled, memory leakage occurs. To solve above issue just destroy all percpu counters when register_sysctl() return NULL. Fixes: 1e7e55731628 ("svcrdma: Restore read and write stats") Fixes: 22df5a22462e ("svcrdma: Convert rdma_stat_sq_starve to a per-CPU counter") Fixes: df971cd853c0 ("svcrdma: Convert rdma_stat_recv to a per-CPU counter") Signed-off-by: Ye Bin Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 58ae6ec4f25b..415c0310101f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -233,25 +233,34 @@ static int svc_rdma_proc_init(void) rc = percpu_counter_init(&svcrdma_stat_read, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err; rc = percpu_counter_init(&svcrdma_stat_recv, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_read; rc = percpu_counter_init(&svcrdma_stat_sq_starve, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_recv; rc = percpu_counter_init(&svcrdma_stat_write, 0, GFP_KERNEL); if (rc) - goto out_err; + goto err_sq; svcrdma_table_header = register_sysctl("sunrpc/svc_rdma", svcrdma_parm_table); + if (!svcrdma_table_header) + goto err_write; + return 0; -out_err: +err_write: + rc = -ENOMEM; + percpu_counter_destroy(&svcrdma_stat_write); +err_sq: percpu_counter_destroy(&svcrdma_stat_sq_starve); +err_recv: percpu_counter_destroy(&svcrdma_stat_recv); +err_read: percpu_counter_destroy(&svcrdma_stat_read); +err: return rc; } -- cgit v1.3 From 0de6a472c3b38432b2f184bd64eb70d9ea36d107 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Nov 2024 16:32:21 -0800 Subject: net/neighbor: clear error in case strict check is not set Commit 51183d233b5a ("net/neighbor: Update neigh_dump_info for strict data checking") added strict checking. The err variable is not cleared, so if we find no table to dump we will return the validation error even if user did not want strict checking. I think the only way to hit this is to send an buggy request, and ask for a table which doesn't exist, so there's no point treating this as a real fix. I only noticed it because a syzbot repro depended on it to trigger another bug. Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241115003221.733593-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5e572f6eaf2c..89656d180bc6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2824,6 +2824,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack); if (err < 0 && cb->strict_check) return err; + err = 0; s_t = cb->args[0]; -- cgit v1.3 From 3fbb27b7f87e60324cec7d2d10a40884182c99d4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 15 Nov 2024 17:52:34 +0100 Subject: mptcp: pm: lockless list traversal to dump endp To return an endpoint to the userspace via Netlink, and to dump all of them, the endpoint list was iterated while holding the pernet->lock, but only to read the content of the list. In these cases, the spin locks can be replaced by RCU read ones, and use the _rcu variants to iterate over the entries list in a lockless way. Note that the __lookup_addr_by_id() helper has been modified to use the _rcu variants of list_for_each_entry(), but with an extra conditions, so it can be called either while the RCU read lock is held, or when the associated pernet->lock is held. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241115-net-next-mptcp-pm-lockless-dump-v1-1-f4a1bcb4ca2c@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 758a0dbfcf78..2b005ddfd2d3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -512,7 +512,8 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { if (entry->addr.id == id) return entry; } @@ -1824,7 +1825,7 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) goto fail; } - spin_lock_bh(&pernet->lock); + rcu_read_lock(); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); @@ -1838,11 +1839,11 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); return ret; unlock_fail: - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); fail: nlmsg_free(msg); @@ -1866,7 +1867,7 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, pernet = pm_nl_get_pernet(net); - spin_lock_bh(&pernet->lock); + rcu_read_lock(); for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { if (test_bit(i, pernet->id_bitmap)) { entry = __lookup_addr_by_id(pernet, i); @@ -1891,7 +1892,7 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, genlmsg_end(msg, hdr); } } - spin_unlock_bh(&pernet->lock); + rcu_read_unlock(); cb->args[0] = id; return msg->len; -- cgit v1.3 From 1d7fa6ceb91fddbe38cae3521d5d1075bce6a00e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 15 Nov 2024 17:52:35 +0100 Subject: mptcp: pm: avoid code duplication to lookup endp The helper __lookup_addr() can be used in mptcp_pm_nl_get_local_id() and mptcp_pm_nl_is_backup() to simplify the code, and avoid code duplication. Co-developed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241115-net-next-mptcp-pm-lockless-dump-v1-2-f4a1bcb4ca2c@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 2b005ddfd2d3..7a0f7998376a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1143,17 +1143,13 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc { struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; - int ret = -1; + int ret; pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { - ret = entry->addr.id; - break; - } - } + entry = __lookup_addr(pernet, skc); + ret = entry ? entry->addr.id : -1; rcu_read_unlock(); if (ret >= 0) return ret; @@ -1180,15 +1176,11 @@ bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct mptcp_pm_addr_entry *entry; - bool backup = false; + bool backup; rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - break; - } - } + entry = __lookup_addr(pernet, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); rcu_read_unlock(); return backup; -- cgit v1.3 From 1cfb5e57886aa69a992ff0ebd32e4651eb0fc995 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 Nov 2024 12:43:03 -0800 Subject: Revert "net: ethtool: Avoid thousands of -Wflex-array-member-not-at-end warnings" This reverts commit 3bd9b9abdf1563a22041b7255baea6d449902f1a. We cannot use the new tagged struct group because it throws C++ errors even under "extern C". Signed-off-by: Kees Cook Link: https://patch.msgid.link/20241115204308.3821419-1-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 6 +++--- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 4 ++-- include/linux/ethtool.h | 2 +- net/ethtool/ioctl.c | 2 +- net/ethtool/linkinfo.c | 8 ++++---- net/ethtool/linkmodes.c | 18 +++++++----------- 9 files changed, 22 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index cfd2c65b1c90..061a40b1974b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2780,7 +2780,7 @@ u32 bnxt_fw_to_ethtool_speed(u16 fw_link_speed) static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings, struct bnxt_link_info *link_info) { - struct ethtool_link_settings_hdr *base = &lk_ksettings->base; + struct ethtool_link_settings *base = &lk_ksettings->base; if (link_info->link_state == BNXT_LINK_STATE_UP) { base->speed = bnxt_fw_to_ethtool_speed(link_info->link_speed); @@ -2799,7 +2799,7 @@ static void bnxt_get_default_speeds(struct ethtool_link_ksettings *lk_ksettings, static int bnxt_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *lk_ksettings) { - struct ethtool_link_settings_hdr *base = &lk_ksettings->base; + struct ethtool_link_settings *base = &lk_ksettings->base; enum ethtool_link_mode_bit_indices link_mode; struct bnxt *bp = netdev_priv(dev); struct bnxt_link_info *link_info; @@ -3022,9 +3022,9 @@ u16 bnxt_get_fw_auto_link_speeds(const unsigned long *mode) static int bnxt_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *lk_ksettings) { - const struct ethtool_link_settings_hdr *base = &lk_ksettings->base; struct bnxt *bp = netdev_priv(dev); struct bnxt_link_info *link_info = &bp->link_info; + const struct ethtool_link_settings *base = &lk_ksettings->base; bool set_pause = false; u32 speed, lanes = 0; int rc = 0; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 45d28a65347e..7f3f5afa864f 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -662,8 +662,8 @@ static unsigned int lmm_to_fw_caps(const unsigned long *link_mode_mask) static int get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { - struct ethtool_link_settings_hdr *base = &link_ksettings->base; struct port_info *pi = netdev_priv(dev); + struct ethtool_link_settings *base = &link_ksettings->base; /* For the nonce, the Firmware doesn't send up Port State changes * when the Virtual Interface attached to the Port is down. So @@ -717,9 +717,9 @@ static int get_link_ksettings(struct net_device *dev, static int set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *link_ksettings) { - const struct ethtool_link_settings_hdr *base = &link_ksettings->base; struct port_info *pi = netdev_priv(dev); struct link_config *lc = &pi->link_cfg; + const struct ethtool_link_settings *base = &link_ksettings->base; struct link_config old_lc; unsigned int fw_caps; int ret = 0; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c index 61d08547e3f9..2fbe0f059a0b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c @@ -1436,8 +1436,8 @@ static void fw_caps_to_lmm(enum fw_port_type port_type, static int cxgb4vf_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { - struct ethtool_link_settings_hdr *base = &link_ksettings->base; struct port_info *pi = netdev_priv(dev); + struct ethtool_link_settings *base = &link_ksettings->base; /* For the nonce, the Firmware doesn't send up Port State changes * when the Virtual Interface attached to the Port is down. So diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 95b071153fed..d607b4f0542c 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -129,8 +129,8 @@ static void enic_intr_coal_set_rx(struct enic *enic, u32 timer) static int enic_get_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *ecmd) { - struct ethtool_link_settings_hdr *base = &ecmd->base; struct enic *enic = netdev_priv(netdev); + struct ethtool_link_settings *base = &ecmd->base; ethtool_link_ksettings_add_link_mode(ecmd, supported, 10000baseT_Full); diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index c553da16d4b1..e50e1df0a433 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -504,7 +504,7 @@ static int qede_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { typeof(cmd->link_modes) *link_modes = &cmd->link_modes; - struct ethtool_link_settings_hdr *base = &cmd->base; + struct ethtool_link_settings *base = &cmd->base; struct qede_dev *edev = netdev_priv(dev); struct qed_link_output current_link; @@ -537,7 +537,7 @@ static int qede_get_link_ksettings(struct net_device *dev, static int qede_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { - const struct ethtool_link_settings_hdr *base = &cmd->base; + const struct ethtool_link_settings *base = &cmd->base; const struct ethtool_forced_speed_map *map; struct qede_dev *edev = netdev_priv(dev); struct qed_link_output current_link; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 299280c94d07..b8b935b52603 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -211,7 +211,7 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id); * fields, but they are allowed to overwrite them (will be ignored). */ struct ethtool_link_ksettings { - struct ethtool_link_settings_hdr base; + struct ethtool_link_settings base; struct { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index d86399bcf223..61df8ce44379 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -425,7 +425,7 @@ convert_link_ksettings_to_legacy_settings( /* layout of the struct passed from/to userland */ struct ethtool_link_usettings { - struct ethtool_link_settings_hdr base; + struct ethtool_link_settings base; struct { __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 2d5bc57160be..30b8ce275159 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -8,9 +8,9 @@ struct linkinfo_req_info { }; struct linkinfo_reply_data { - struct ethnl_reply_data base; - struct ethtool_link_ksettings ksettings; - struct ethtool_link_settings_hdr *lsettings; + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; }; #define LINKINFO_REPDATA(__reply_base) \ @@ -98,7 +98,7 @@ static int ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; - struct ethtool_link_settings_hdr *lsettings; + struct ethtool_link_settings *lsettings; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 17e49cf89f03..259cd9ef1f2a 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -11,10 +11,10 @@ struct linkmodes_req_info { }; struct linkmodes_reply_data { - struct ethnl_reply_data base; - struct ethtool_link_ksettings ksettings; - struct ethtool_link_settings_hdr *lsettings; - bool peer_empty; + struct ethnl_reply_data base; + struct ethtool_link_ksettings ksettings; + struct ethtool_link_settings *lsettings; + bool peer_empty; }; #define LINKMODES_REPDATA(__reply_base) \ @@ -62,12 +62,10 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; - const struct ethtool_link_settings_hdr *lsettings; int len, ret; - lsettings = &ksettings->base; - len = nla_total_size(sizeof(u8)) /* LINKMODES_AUTONEG */ + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ @@ -105,12 +103,10 @@ static int linkmodes_fill_reply(struct sk_buff *skb, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; - const struct ethtool_link_settings_hdr *lsettings; int ret; - lsettings = &ksettings->base; - if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, lsettings->autoneg)) return -EMSGSIZE; @@ -241,7 +237,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod, const struct net_device *dev) { - struct ethtool_link_settings_hdr *lsettings = &ksettings->base; + struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_lanes, req_duplex; const struct nlattr *master_slave_cfg, *lanes_cfg; int ret; -- cgit v1.3 From 85c7975acd970dac0fd7b0813763faade027c55e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 18 Nov 2024 17:14:27 +0800 Subject: net: ip: fix unexpected return in fib_validate_source() The errno should be replaced with drop reasons in fib_validate_source(), and the "-EINVAL" shouldn't be returned. And this causes a warning, which is reported by syzkaller: netlink: 'syz-executor371': attribute type 4 has an invalid length. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 5842 at net/core/skbuff.c:1219 __sk_skb_reason_drop net/core/skbuff.c:1216 [inline] WARNING: CPU: 0 PID: 5842 at net/core/skbuff.c:1219 sk_skb_reason_drop+0x87/0x380 net/core/skbuff.c:1241 Modules linked in: CPU: 0 UID: 0 PID: 5842 Comm: syz-executor371 Not tainted 6.12.0-rc6-syzkaller-01362-ga58f00ed24b8 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/30/2024 RIP: 0010:__sk_skb_reason_drop net/core/skbuff.c:1216 [inline] RIP: 0010:sk_skb_reason_drop+0x87/0x380 net/core/skbuff.c:1241 Code: 00 00 00 fc ff df 41 8d 9e 00 00 fc ff bf 01 00 fc ff 89 de e8 ea 9f 08 f8 81 fb 00 00 fc ff 77 3a 4c 89 e5 e8 9a 9b 08 f8 90 <0f> 0b 90 eb 5e bf 01 00 00 00 89 ee e8 c8 9f 08 f8 85 ed 0f 8e 49 RSP: 0018:ffffc90003d57078 EFLAGS: 00010293 RAX: ffffffff898c3ec6 RBX: 00000000fffbffea RCX: ffff8880347a5a00 RDX: 0000000000000000 RSI: 00000000fffbffea RDI: 00000000fffc0001 RBP: dffffc0000000000 R08: ffffffff898c3eb6 R09: 1ffff110023eb7d4 R10: dffffc0000000000 R11: ffffed10023eb7d5 R12: dffffc0000000000 R13: ffff888011f5bdc0 R14: 00000000ffffffea R15: 0000000000000000 FS: 000055557d41e380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056519d31d608 CR3: 000000007854e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: kfree_skb_reason include/linux/skbuff.h:1263 [inline] ip_rcv_finish_core+0xfde/0x1b50 net/ipv4/ip_input.c:424 ip_list_rcv_finish net/ipv4/ip_input.c:610 [inline] ip_sublist_rcv+0x3b1/0xab0 net/ipv4/ip_input.c:636 ip_list_rcv+0x42b/0x480 net/ipv4/ip_input.c:670 __netif_receive_skb_list_ptype net/core/dev.c:5715 [inline] __netif_receive_skb_list_core+0x94e/0x980 net/core/dev.c:5762 __netif_receive_skb_list net/core/dev.c:5814 [inline] netif_receive_skb_list_internal+0xa51/0xe30 net/core/dev.c:5905 netif_receive_skb_list+0x55/0x4b0 net/core/dev.c:5957 xdp_recv_frames net/bpf/test_run.c:280 [inline] xdp_test_run_batch net/bpf/test_run.c:361 [inline] bpf_test_run_xdp_live+0x1b5e/0x21b0 net/bpf/test_run.c:390 bpf_prog_test_run_xdp+0x805/0x11e0 net/bpf/test_run.c:1318 bpf_prog_test_run+0x2e4/0x360 kernel/bpf/syscall.c:4266 __sys_bpf+0x48d/0x810 kernel/bpf/syscall.c:5671 __do_sys_bpf kernel/bpf/syscall.c:5760 [inline] __se_sys_bpf kernel/bpf/syscall.c:5758 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5758 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f18af25a8e9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffee4090af8 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f18af25a8e9 RDX: 0000000000000048 RSI: 0000000020000600 RDI: 000000000000000a RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fix it by returning "-SKB_DROP_REASON_IP_LOCAL_SOURCE" instead of "-EINVAL" in fib_validate_source(). Reported-by: syzbot+52fbd90f020788ec7709@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6738e539.050a0220.e1c64.0002.GAE@google.com/ Fixes: 82d9983ebeb8 ("net: ip: make ip_route_input_noref() return drop reasons") Signed-off-by: Menglong Dong Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 87bb36a5bdec..272e42d81323 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -447,7 +447,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, * and the same host but different containers are not. */ if (inet_lookup_ifaddr_rcu(net, src)) - return -EINVAL; + return -SKB_DROP_REASON_IP_LOCAL_SOURCE; ok: *itag = 0; -- cgit v1.3 From c69c5e10adb903ae2438d4f9c16eccf43d1fcbc1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 18 Nov 2024 03:15:17 -0800 Subject: netpoll: Use rcu_access_pointer() in __netpoll_setup The ndev->npinfo pointer in __netpoll_setup() is RCU-protected but is being accessed directly for a NULL check. While no RCU read lock is held in this context, we should still use proper RCU primitives for consistency and correctness. Replace the direct NULL check with rcu_access_pointer(), which is the appropriate primitive when only checking for NULL without dereferencing the pointer. This function provides the necessary ordering guarantees without requiring RCU read-side protection. Reviewed-by: Michal Kubiak Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20241118-netpoll_rcu-v1-1-a1888dcb4a02@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index aa49b92e9194..45fb60bc4803 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -626,7 +626,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto out; } - if (!ndev->npinfo) { + if (!rcu_access_pointer(ndev->npinfo)) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; -- cgit v1.3 From 8ca2a1eeadf09862190b2810697702d803ceef2d Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Mon, 18 Nov 2024 11:09:09 +0800 Subject: bpf: fix recursive lock when verdict program return SK_PASS When the stream_verdict program returns SK_PASS, it places the received skb into its own receive queue, but a recursive lock eventually occurs, leading to an operating system deadlock. This issue has been present since v6.9. ''' sk_psock_strp_data_ready write_lock_bh(&sk->sk_callback_lock) strp_data_ready strp_read_sock read_sock -> tcp_read_sock strp_recv cb.rcv_msg -> sk_psock_strp_read # now stream_verdict return SK_PASS without peer sock assign __SK_PASS = sk_psock_map_verd(SK_PASS, NULL) sk_psock_verdict_apply sk_psock_skb_ingress_self sk_psock_skb_ingress_enqueue sk_psock_data_ready read_lock_bh(&sk->sk_callback_lock) <= dead lock ''' This topic has been discussed before, but it has not been fixed. Previous discussion: https://lore.kernel.org/all/6684a5864ec86_403d20898@john.notmuch Fixes: 6648e613226e ("bpf, skmsg: Fix NULL pointer dereference in sk_psock_skb_ingress_enqueue") Reported-by: Vincent Whitchurch Signed-off-by: Jiayuan Chen Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20241118030910.36230-2-mrpre@163.com Signed-off-by: Jakub Kicinski --- net/core/skmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index b1dcbd3be89e..e90fbab703b2 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1117,9 +1117,9 @@ static void sk_psock_strp_data_ready(struct sock *sk) if (tls_sw_has_ctx_rx(sk)) { psock->saved_data_ready(sk); } else { - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); strp_data_ready(&psock->strp); - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); -- cgit v1.3 From 7ef3ae82a6ebbf4750967d1ce43bcdb7e44ff74b Mon Sep 17 00:00:00 2001 From: Alex Zenla Date: Tue, 19 Nov 2024 21:16:33 +0000 Subject: 9p/xen: fix init sequence Large amount of mount hangs observed during hotplugging of 9pfs devices. The 9pfs Xen driver attempts to initialize itself more than once, causing the frontend and backend to disagree: the backend listens on a channel that the frontend does not send on, resulting in stalled processing. Only allow initialization of 9p frontend once. Fixes: c15fe55d14b3b ("9p/xen: fix connection sequence") Signed-off-by: Alex Zenla Signed-off-by: Alexander Merritt Signed-off-by: Ariadne Conill Reviewed-by: Juergen Gross Message-ID: <20241119211633.38321-1-alexander@edera.dev> Signed-off-by: Dominique Martinet --- net/9p/trans_xen.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index dfdbe1ca5338..0304e8a1616d 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -465,6 +465,7 @@ static int xen_9pfs_front_init(struct xenbus_device *dev) goto error; } + xenbus_switch_state(dev, XenbusStateInitialised); return 0; error_xenbus: @@ -512,8 +513,10 @@ static void xen_9pfs_front_changed(struct xenbus_device *dev, break; case XenbusStateInitWait: - if (!xen_9pfs_front_init(dev)) - xenbus_switch_state(dev, XenbusStateInitialised); + if (dev->state != XenbusStateInitialising) + break; + + xen_9pfs_front_init(dev); break; case XenbusStateConnected: -- cgit v1.3 From e43c608f40c065b30964f0a806348062991b802d Mon Sep 17 00:00:00 2001 From: Alex Zenla Date: Thu, 21 Nov 2024 22:51:00 +0000 Subject: 9p/xen: fix release of IRQ Kernel logs indicate an IRQ was double-freed. Pass correct device ID during IRQ release. Fixes: 71ebd71921e45 ("xen/9pfs: connect to the backend") Signed-off-by: Alex Zenla Signed-off-by: Alexander Merritt Signed-off-by: Ariadne Conill Reviewed-by: Juergen Gross Message-ID: <20241121225100.5736-1-alexander@edera.dev> [Dominique: remove confusing variable reset to 0] Signed-off-by: Dominique Martinet --- net/9p/trans_xen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 0304e8a1616d..b9ff69c7522a 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -286,7 +286,7 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) if (!priv->rings[i].intf) break; if (priv->rings[i].irq > 0) - unbind_from_irqhandler(priv->rings[i].irq, priv->dev); + unbind_from_irqhandler(priv->rings[i].irq, ring); if (priv->rings[i].data.in) { for (j = 0; j < (1 << priv->rings[i].intf->ring_order); -- cgit v1.3 From e0260d530b73ee969ae971d14daa02376dcfc93f Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 22 Nov 2024 23:43:02 +0900 Subject: net/9p/usbg: allow building as standalone module There is no reason only the usbg transport would not be its own module, so make it tristate. In particular, this fixes a couple of issues the current bool had: - trans_usbg was apparently not compiled at all when NET_9P=m - the workaround added in commit 2193ede180dd ("net/9p/usbg: fix CONFIG_USB_GADGET dependency") became redundant because a tristate item cannot be built-in when its dependency is a module, so we can depend on USB_GADGET "normally" again. Cc: Michael Grzeschik Link: https://lkml.kernel.org/r/ZzhWRPDNwu225NWz@codewreck.org Message-ID: <20241122144754.1231919-1-asmadeus@codewreck.org> Signed-off-by: Dominique Martinet --- net/9p/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/Kconfig b/net/9p/Kconfig index ee967fd25312..22f8c167845d 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -41,8 +41,8 @@ config NET_9P_XEN two Xen domains. config NET_9P_USBG - bool "9P USB Gadget Transport" - depends on USB_GADGET=y || USB_GADGET=NET_9P + tristate "9P USB Gadget Transport" + depends on USB_GADGET select CONFIGFS_FS select USB_LIBCOMPOSITE help -- cgit v1.3 From 9b234a97b10cf1385d451a3824539b774abbcdaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Nov 2024 19:41:05 +0000 Subject: rtnetlink: fix rtnl_dump_ifinfo() error path syzbot found that rtnl_dump_ifinfo() could return with a lock held [1] Move code around so that rtnl_link_ops_put() and put_net() can be called at the end of this function. [1] WARNING: lock held when returning to user space! 6.12.0-rc7-syzkaller-01681-g38f83a57aa8e #0 Not tainted syz-executor399/5841 is leaving the kernel with locks still held! 1 lock held by syz-executor399/5841: #0: ffffffff8f46c2a0 (&ops->srcu#2){.+.+}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8f46c2a0 (&ops->srcu#2){.+.+}-{0:0}, at: rcu_read_lock include/linux/rcupdate.h:849 [inline] #0: ffffffff8f46c2a0 (&ops->srcu#2){.+.+}-{0:0}, at: rtnl_link_ops_get+0x22/0x250 net/core/rtnetlink.c:555 Fixes: 43c7ce69d28e ("rtnetlink: Protect struct rtnl_link_ops with SRCU.") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Joe Damato Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241121194105.3632507-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dd142f444659..58df76fe408a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2442,7 +2442,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); - return PTR_ERR(tgt_net); + err = PTR_ERR(tgt_net); + netnsid = -1; + goto out; } break; case IFLA_EXT_MASK: @@ -2457,7 +2459,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); - return -EINVAL; + err = -EINVAL; + goto out; } } } @@ -2479,11 +2482,14 @@ walk_entries: break; } - if (kind_ops) - rtnl_link_ops_put(kind_ops, ops_srcu_index); cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + +out: + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); if (netnsid >= 0) put_net(tgt_net); -- cgit v1.3 From 3bf39fa849ab8ed52abb6715922e6102d3df9f97 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Nov 2024 14:44:31 -0800 Subject: netlink: fix false positive warning in extack during dumps Commit under fixes extended extack reporting to dumps. It works under normal conditions, because extack errors are usually reported during ->start() or the first ->dump(), it's quite rare that the dump starts okay but fails later. If the dump does fail later, however, the input skb will already have the initiating message pulled, so checking if bad attr falls within skb->data will fail. Switch the check to using nlh, which is always valid. syzbot found a way to hit that scenario by filling up the receive queue. In this case we initiate a dump but don't call ->dump() until there is read space for an skb. WARNING: CPU: 1 PID: 5845 at net/netlink/af_netlink.c:2210 netlink_ack_tlv_fill+0x1a8/0x560 net/netlink/af_netlink.c:2209 RIP: 0010:netlink_ack_tlv_fill+0x1a8/0x560 net/netlink/af_netlink.c:2209 Call Trace: netlink_dump_done+0x513/0x970 net/netlink/af_netlink.c:2250 netlink_dump+0x91f/0xe10 net/netlink/af_netlink.c:2351 netlink_recvmsg+0x6bb/0x11d0 net/netlink/af_netlink.c:1983 sock_recvmsg_nosec net/socket.c:1051 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1073 __sys_recvfrom+0x246/0x3d0 net/socket.c:2267 __do_sys_recvfrom net/socket.c:2285 [inline] __se_sys_recvfrom net/socket.c:2281 [inline] __x64_sys_recvfrom+0xde/0x100 net/socket.c:2281 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7ff37dd17a79 Reported-by: syzbot+d4373fa8042c06cefa84@syzkaller.appspotmail.com Fixes: 8af4f60472fc ("netlink: support all extack types in dumps") Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20241119224432.1713040-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index dd3517b0fdfd..f4e7b5e4bb59 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2181,9 +2181,14 @@ netlink_ack_tlv_len(struct netlink_sock *nlk, int err, return tlvlen; } +static bool nlmsg_check_in_payload(const struct nlmsghdr *nlh, const void *addr) +{ + return !WARN_ON(addr < nlmsg_data(nlh) || + addr - (const void *) nlh >= nlh->nlmsg_len); +} + static void -netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, - const struct nlmsghdr *nlh, int err, +netlink_ack_tlv_fill(struct sk_buff *skb, const struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { if (extack->_msg) @@ -2195,9 +2200,7 @@ netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, if (!err) return; - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) + if (extack->bad_attr && nlmsg_check_in_payload(nlh, extack->bad_attr)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (const u8 *)nlh)); if (extack->policy) @@ -2206,9 +2209,7 @@ netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb, if (extack->miss_type) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE, extack->miss_type)); - if (extack->miss_nest && - !WARN_ON((u8 *)extack->miss_nest < in_skb->data || - (u8 *)extack->miss_nest > in_skb->data + in_skb->len)) + if (extack->miss_nest && nlmsg_check_in_payload(nlh, extack->miss_nest)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST, (u8 *)extack->miss_nest - (const u8 *)nlh)); } @@ -2237,7 +2238,7 @@ static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, if (extack_len) { nlh->nlmsg_flags |= NLM_F_ACK_TLVS; if (skb_tailroom(skb) >= extack_len) { - netlink_ack_tlv_fill(cb->skb, skb, cb->nlh, + netlink_ack_tlv_fill(skb, cb->nlh, nlk->dump_done_errno, extack); nlmsg_end(skb, nlh); } @@ -2496,7 +2497,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, } if (tlvlen) - netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); + netlink_ack_tlv_fill(skb, nlh, err, extack); nlmsg_end(skb, rep); -- cgit v1.3 From 5d066766c5f1252f98ff859265bcd1a5b52ac46c Mon Sep 17 00:00:00 2001 From: James Chapman Date: Mon, 18 Nov 2024 14:04:11 +0000 Subject: net/l2tp: fix warning in l2tp_exit_net found by syzbot In l2tp's net exit handler, we check that an IDR is empty before destroying it: WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_tunnel_idr)); idr_destroy(&pn->l2tp_tunnel_idr); By forcing memory allocation failures in idr_alloc_32, syzbot is able to provoke a condition where idr_is_empty returns false despite there being no items in the IDR. This turns out to be because the radix tree of the IDR contains only internal radix-tree nodes and it is this that causes idr_is_empty to return false. The internal nodes are cleaned by idr_destroy. Use idr_for_each to check that the IDR is empty instead of idr_is_empty to avoid the problem. Reported-by: syzbot+332fe1e67018625f63c9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=332fe1e67018625f63c9 Fixes: 73d33bd063c4 ("l2tp: avoid using drain_workqueue in l2tp_pre_exit_net") Signed-off-by: James Chapman Link: https://patch.msgid.link/20241118140411.1582555-1-jchapman@katalix.com Signed-off-by: Paolo Abeni --- net/l2tp/l2tp_core.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 3eec23ac5ab1..369a2f2e459c 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1870,15 +1870,31 @@ static __net_exit void l2tp_pre_exit_net(struct net *net) } } +static int l2tp_idr_item_unexpected(int id, void *p, void *data) +{ + const char *idr_name = data; + + pr_err("l2tp: %s IDR not empty at net %d exit\n", idr_name, id); + WARN_ON_ONCE(1); + return 1; +} + static __net_exit void l2tp_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); - WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_v2_session_idr)); + /* Our per-net IDRs should be empty. Check that is so, to + * help catch cleanup races or refcnt leaks. + */ + idr_for_each(&pn->l2tp_v2_session_idr, l2tp_idr_item_unexpected, + "v2_session"); + idr_for_each(&pn->l2tp_v3_session_idr, l2tp_idr_item_unexpected, + "v3_session"); + idr_for_each(&pn->l2tp_tunnel_idr, l2tp_idr_item_unexpected, + "tunnel"); + idr_destroy(&pn->l2tp_v2_session_idr); - WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_v3_session_idr)); idr_destroy(&pn->l2tp_v3_session_idr); - WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_tunnel_idr)); idr_destroy(&pn->l2tp_tunnel_idr); } -- cgit v1.3 From ebaf81317e42aa990ad20b113cfe3a7b20d4e937 Mon Sep 17 00:00:00 2001 From: Sidraya Jayagond Date: Tue, 19 Nov 2024 16:22:19 +0100 Subject: s390/iucv: MSG_PEEK causes memory leak in iucv_sock_destruct() Passing MSG_PEEK flag to skb_recv_datagram() increments skb refcount (skb->users) and iucv_sock_recvmsg() does not decrement skb refcount at exit. This results in skb memory leak in skb_queue_purge() and WARN_ON in iucv_sock_destruct() during socket close. To fix this decrease skb refcount by one if MSG_PEEK is set in order to prevent memory leak and WARN_ON. WARNING: CPU: 2 PID: 6292 at net/iucv/af_iucv.c:286 iucv_sock_destruct+0x144/0x1a0 [af_iucv] CPU: 2 PID: 6292 Comm: afiucv_test_msg Kdump: loaded Tainted: G W 6.10.0-rc7 #1 Hardware name: IBM 3931 A01 704 (z/VM 7.3.0) Call Trace: [<001587c682c4aa98>] iucv_sock_destruct+0x148/0x1a0 [af_iucv] [<001587c682c4a9d0>] iucv_sock_destruct+0x80/0x1a0 [af_iucv] [<001587c704117a32>] __sk_destruct+0x52/0x550 [<001587c704104a54>] __sock_release+0xa4/0x230 [<001587c704104c0c>] sock_close+0x2c/0x40 [<001587c702c5f5a8>] __fput+0x2e8/0x970 [<001587c7024148c4>] task_work_run+0x1c4/0x2c0 [<001587c7023b0716>] do_exit+0x996/0x1050 [<001587c7023b13aa>] do_group_exit+0x13a/0x360 [<001587c7023b1626>] __s390x_sys_exit_group+0x56/0x60 [<001587c7022bccca>] do_syscall+0x27a/0x380 [<001587c7049a6a0c>] __do_syscall+0x9c/0x160 [<001587c7049ce8a8>] system_call+0x70/0x98 Last Breaking-Event-Address: [<001587c682c4a9d4>] iucv_sock_destruct+0x84/0x1a0 [af_iucv] Fixes: eac3731bd04c ("[S390]: Add AF_IUCV socket support") Reviewed-by: Alexandra Winter Reviewed-by: Thorsten Winkler Signed-off-by: Sidraya Jayagond Signed-off-by: Alexandra Winter Reviewed-by: David Wei Link: https://patch.msgid.link/20241119152219.3712168-1-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- net/iucv/af_iucv.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c00323fa9eb6..7929df08d4e0 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1236,7 +1236,9 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg, return -EOPNOTSUPP; /* receive/dequeue next skb: - * the function understands MSG_PEEK and, thus, does not dequeue skb */ + * the function understands MSG_PEEK and, thus, does not dequeue skb + * only refcount is increased. + */ skb = skb_recv_datagram(sk, flags, &err); if (!skb) { if (sk->sk_shutdown & RCV_SHUTDOWN) @@ -1252,9 +1254,8 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg, cskb = skb; if (skb_copy_datagram_msg(cskb, offset, msg, copied)) { - if (!(flags & MSG_PEEK)) - skb_queue_head(&sk->sk_receive_queue, skb); - return -EFAULT; + err = -EFAULT; + goto err_out; } /* SOCK_SEQPACKET: set MSG_TRUNC if recv buf size is too small */ @@ -1271,11 +1272,8 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg, err = put_cmsg(msg, SOL_IUCV, SCM_IUCV_TRGCLS, sizeof(IUCV_SKB_CB(skb)->class), (void *)&IUCV_SKB_CB(skb)->class); - if (err) { - if (!(flags & MSG_PEEK)) - skb_queue_head(&sk->sk_receive_queue, skb); - return err; - } + if (err) + goto err_out; /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { @@ -1331,8 +1329,18 @@ done: /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC)) copied = rlen; + if (flags & MSG_PEEK) + skb_unref(skb); return copied; + +err_out: + if (!(flags & MSG_PEEK)) + skb_queue_head(&sk->sk_receive_queue, skb); + else + skb_unref(skb); + + return err; } static inline __poll_t iucv_accept_poll(struct sock *parent) -- cgit v1.3 From 00b5b7aab9e422d00d5a9d03d7e0760a76b5d57f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 20 Nov 2024 09:51:07 +0000 Subject: net/ipv6: delete temporary address if mngtmpaddr is removed or unmanaged MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RFC8981 section 3.4 says that existing temporary addresses must have their lifetimes adjusted so that no temporary addresses should ever remain "valid" or "preferred" longer than the incoming SLAAC Prefix Information. This would strongly imply in Linux's case that if the "mngtmpaddr" address is deleted or un-flagged as such, its corresponding temporary addresses must be cleared out right away. But now the temporary address is renewed even after ‘mngtmpaddr’ is removed or becomes unmanaged as manage_tempaddrs() set temporary addresses prefered/valid time to 0, and later in addrconf_verify_rtnl() all checkings failed to remove the addresses. Fix this by deleting the temporary address directly for these situations. Fixes: 778964f2fdf0 ("ipv6/addrconf: fix timing bug in tempaddr regen") Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- net/ipv6/addrconf.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96b5b2b0d507..c489a1e6aec9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2570,6 +2570,24 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev) return idev; } +static void delete_tempaddrs(struct inet6_dev *idev, + struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ift, *tmp; + + write_lock_bh(&idev->lock); + list_for_each_entry_safe(ift, tmp, &idev->tempaddr_list, tmp_list) { + if (ift->ifpub != ifp) + continue; + + in6_ifa_hold(ift); + write_unlock_bh(&idev->lock); + ipv6_del_addr(ift); + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + static void manage_tempaddrs(struct inet6_dev *idev, struct inet6_ifaddr *ifp, __u32 valid_lft, __u32 prefered_lft, @@ -3124,11 +3142,12 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, in6_ifa_hold(ifp); read_unlock_bh(&idev->lock); - if (!(ifp->flags & IFA_F_TEMPORARY) && - (ifa_flags & IFA_F_MANAGETEMPADDR)) - manage_tempaddrs(idev, ifp, 0, 0, false, - jiffies); ipv6_del_addr(ifp); + + if (!(ifp->flags & IFA_F_TEMPORARY) && + (ifp->flags & IFA_F_MANAGETEMPADDR)) + delete_tempaddrs(idev, ifp); + addrconf_verify_rtnl(net); if (ipv6_addr_is_multicast(pfx)) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, @@ -4952,14 +4971,12 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, } if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) { - if (was_managetempaddr && - !(ifp->flags & IFA_F_MANAGETEMPADDR)) { - cfg->valid_lft = 0; - cfg->preferred_lft = 0; - } - manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, - cfg->preferred_lft, !was_managetempaddr, - jiffies); + if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR)) + delete_tempaddrs(ifp->idev, ifp); + else + manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft, + cfg->preferred_lft, !was_managetempaddr, + jiffies); } addrconf_verify_rtnl(net); -- cgit v1.3 From 9cfb5e7f0ded2bfaabc270ceb5f91d13f0e805b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Nov 2024 17:13:43 +0000 Subject: net: hsr: fix hsr_init_sk() vs network/transport headers. Following sequence in hsr_init_sk() is invalid : skb_reset_mac_header(skb); skb_reset_mac_len(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); It is invalid because skb_reset_mac_len() needs the correct network header, which should be after the mac header. This patch moves the skb_reset_network_header() and skb_reset_transport_header() before the call to dev_hard_header(). As a result skb->mac_len is no longer set to a value close to 65535. Fixes: 48b491a5cc74 ("net: hsr: fix mac_len checks") Signed-off-by: Eric Dumazet Cc: George McCollister Link: https://patch.msgid.link/20241122171343.897551-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 9e64496a5c1c..31a416ee21ad 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -268,6 +268,8 @@ static struct sk_buff *hsr_init_skb(struct hsr_port *master) skb->dev = master->dev; skb->priority = TC_PRIO_CONTROL; + skb_reset_network_header(skb); + skb_reset_transport_header(skb); if (dev_hard_header(skb, skb->dev, ETH_P_PRP, hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) @@ -275,8 +277,6 @@ static struct sk_buff *hsr_init_skb(struct hsr_port *master) skb_reset_mac_header(skb); skb_reset_mac_len(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); return skb; out: -- cgit v1.3 From 0b882940665ca2849386ee459d4331aa2f8c4e7d Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 15 Nov 2024 10:45:31 -0500 Subject: Bluetooth: MGMT: Fix slab-use-after-free Read in set_powered_sync This fixes the following crash: ================================================================== BUG: KASAN: slab-use-after-free in set_powered_sync+0x3a/0xc0 net/bluetooth/mgmt.c:1353 Read of size 8 at addr ffff888029b4dd18 by task kworker/u9:0/54 CPU: 1 UID: 0 PID: 54 Comm: kworker/u9:0 Not tainted 6.11.0-rc6-syzkaller-01155-gf723224742fc #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 Workqueue: hci0 hci_cmd_sync_work Call Trace: __dump_stack lib/dump_stack.c:93 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 q kasan_report+0x143/0x180 mm/kasan/report.c:601 set_powered_sync+0x3a/0xc0 net/bluetooth/mgmt.c:1353 hci_cmd_sync_work+0x22b/0x400 net/bluetooth/hci_sync.c:328 process_one_work kernel/workqueue.c:3231 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3312 worker_thread+0x86d/0xd10 kernel/workqueue.c:3389 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Allocated by task 5247: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:370 [inline] __kasan_kmalloc+0x98/0xb0 mm/kasan/common.c:387 kasan_kmalloc include/linux/kasan.h:211 [inline] __kmalloc_cache_noprof+0x19c/0x2c0 mm/slub.c:4193 kmalloc_noprof include/linux/slab.h:681 [inline] kzalloc_noprof include/linux/slab.h:807 [inline] mgmt_pending_new+0x65/0x250 net/bluetooth/mgmt_util.c:269 mgmt_pending_add+0x36/0x120 net/bluetooth/mgmt_util.c:296 set_powered+0x3cd/0x5e0 net/bluetooth/mgmt.c:1394 hci_mgmt_cmd+0xc47/0x11d0 net/bluetooth/hci_sock.c:1712 hci_sock_sendmsg+0x7b8/0x11c0 net/bluetooth/hci_sock.c:1832 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 sock_write_iter+0x2dd/0x400 net/socket.c:1160 new_sync_write fs/read_write.c:497 [inline] vfs_write+0xa72/0xc90 fs/read_write.c:590 ksys_write+0x1a0/0x2c0 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5246: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2256 [inline] slab_free mm/slub.c:4477 [inline] kfree+0x149/0x360 mm/slub.c:4598 settings_rsp+0x2bc/0x390 net/bluetooth/mgmt.c:1443 mgmt_pending_foreach+0xd1/0x130 net/bluetooth/mgmt_util.c:259 __mgmt_power_off+0x112/0x420 net/bluetooth/mgmt.c:9455 hci_dev_close_sync+0x665/0x11a0 net/bluetooth/hci_sync.c:5191 hci_dev_do_close net/bluetooth/hci_core.c:483 [inline] hci_dev_close+0x112/0x210 net/bluetooth/hci_core.c:508 sock_do_ioctl+0x158/0x460 net/socket.c:1222 sock_ioctl+0x629/0x8e0 net/socket.c:1341 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83gv entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-by: syzbot+03d6270b6425df1605bf@syzkaller.appspotmail.com Tested-by: syzbot+03d6270b6425df1605bf@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=03d6270b6425df1605bf Fixes: 275f3f648702 ("Bluetooth: Fix not checking MGMT cmd pending queue") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1f6d083682b8..e406eb8e4327 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1318,7 +1318,8 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_mode *cp; /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) return; cp = cmd->param; @@ -1351,7 +1352,13 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) static int set_powered_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode *cp; + + /* Make sure cmd still outstanding. */ + if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + return -ECANCELED; + + cp = cmd->param; BT_DBG("%s", hdev->name); -- cgit v1.3 From a66dfaf18fd61bb75ef8cee83db46b2aadf153d0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 21 Nov 2024 11:09:22 -0500 Subject: Bluetooth: MGMT: Fix possible deadlocks This fixes possible deadlocks like the following caused by hci_cmd_sync_dequeue causing the destroy function to run: INFO: task kworker/u19:0:143 blocked for more than 120 seconds. Tainted: G W O 6.8.0-2024-03-19-intel-next-iLS-24ww14 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/u19:0 state:D stack:0 pid:143 tgid:143 ppid:2 flags:0x00004000 Workqueue: hci0 hci_cmd_sync_work [bluetooth] Call Trace: __schedule+0x374/0xaf0 schedule+0x3c/0xf0 schedule_preempt_disabled+0x1c/0x30 __mutex_lock.constprop.0+0x3ef/0x7a0 __mutex_lock_slowpath+0x13/0x20 mutex_lock+0x3c/0x50 mgmt_set_connectable_complete+0xa4/0x150 [bluetooth] ? kfree+0x211/0x2a0 hci_cmd_sync_dequeue+0xae/0x130 [bluetooth] ? __pfx_cmd_complete_rsp+0x10/0x10 [bluetooth] cmd_complete_rsp+0x26/0x80 [bluetooth] mgmt_pending_foreach+0x4d/0x70 [bluetooth] __mgmt_power_off+0x8d/0x180 [bluetooth] ? _raw_spin_unlock_irq+0x23/0x40 hci_dev_close_sync+0x445/0x5b0 [bluetooth] hci_set_powered_sync+0x149/0x250 [bluetooth] set_powered_sync+0x24/0x60 [bluetooth] hci_cmd_sync_work+0x90/0x150 [bluetooth] process_one_work+0x13e/0x300 worker_thread+0x2f7/0x420 ? __pfx_worker_thread+0x10/0x10 kthread+0x107/0x140 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x3d/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Tested-by: Kiran K Fixes: f53e1c9c726d ("Bluetooth: MGMT: Fix possible crash on mgmt_index_removed") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index e406eb8e4327..b31192d473d0 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1518,7 +1518,8 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) return; hci_dev_lock(hdev); @@ -1692,7 +1693,8 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) return; hci_dev_lock(hdev); @@ -1924,7 +1926,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) bool changed; /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev)) return; if (err) { @@ -3848,7 +3850,8 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - if (cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) return; if (status) { @@ -4023,7 +4026,8 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) struct sk_buff *skb = cmd->skb; u8 status = mgmt_status(err); - if (cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) return; if (!status) { @@ -5914,13 +5918,16 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) return; - bt_dev_dbg(hdev, "err %d", err); - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), cmd->param, 1); mgmt_pending_remove(cmd); @@ -6153,7 +6160,8 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - if (cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) return; bt_dev_dbg(hdev, "err %d", err); @@ -8144,7 +8152,8 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; - if (cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) + if (err == -ECANCELED || + cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) return; if (!status) { -- cgit v1.3 From ed9588554943097bdf09588a8a105fbb058869c5 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 26 Nov 2024 07:58:43 +0800 Subject: Bluetooth: SCO: remove the redundant sco_conn_put When adding conn, it is necessary to increase and retain the conn reference count at the same time. Another problem was fixed along the way, conn_put is missing when hcon is NULL in the timeout routine. Fixes: e6720779ae61 ("Bluetooth: SCO: Use kref to track lifetime of sco_conn") Reported-and-tested-by: syzbot+489f78df4709ac2bfdd3@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=489f78df4709ac2bfdd3 Signed-off-by: Edward Adam Davis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 1b8e468d24cf..78f7bca24487 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -143,6 +143,7 @@ static void sco_sock_timeout(struct work_struct *work) sco_conn_lock(conn); if (!conn->hcon) { sco_conn_unlock(conn); + sco_conn_put(conn); return; } sk = sco_sock_hold(conn); @@ -192,7 +193,6 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) conn->hcon = hcon; sco_conn_unlock(conn); } - sco_conn_put(conn); return conn; } -- cgit v1.3 From 1465036b10be4b8b00eb31c879e86de633ad74c1 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 19 Nov 2024 14:31:41 +0100 Subject: llc: Improve setsockopt() handling of malformed user input copy_from_sockptr() is used incorrectly: return value is the number of bytes that could not be copied. Since it's deprecated, switch to copy_safe_from_sockptr(). Note: Keeping the `optlen != sizeof(int)` check as copy_safe_from_sockptr() by itself would also accept optlen > sizeof(int). Which would allow a more lenient handling of inputs. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Suggested-by: David Wei Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni --- net/llc/af_llc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 4eb52add7103..0259cde394ba 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1098,7 +1098,7 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; - rc = copy_from_sockptr(&opt, optval, sizeof(opt)); + rc = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (rc) goto out; rc = -EINVAL; -- cgit v1.3 From 02020056647017e70509bb58c3096448117099e1 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Tue, 19 Nov 2024 14:31:42 +0100 Subject: rxrpc: Improve setsockopt() handling of malformed user input copy_from_sockptr() does not return negative value on error; instead, it reports the number of bytes that failed to copy. Since it's deprecated, switch to copy_safe_from_sockptr(). Note: Keeping the `optlen != sizeof(unsigned int)` check as copy_safe_from_sockptr() by itself would also accept optlen > sizeof(unsigned int). Which would allow a more lenient handling of inputs. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni --- net/rxrpc/af_rxrpc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index f4844683e120..9d8bd0b37e41 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -707,9 +707,10 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; - ret = copy_from_sockptr(&min_sec_level, optval, - sizeof(unsigned int)); - if (ret < 0) + ret = copy_safe_from_sockptr(&min_sec_level, + sizeof(min_sec_level), + optval, optlen); + if (ret) goto error; ret = -EINVAL; if (min_sec_level > RXRPC_SECURITY_MAX) -- cgit v1.3 From c31e72d021db2714df03df6c42855a1db592716c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 23 Nov 2024 09:42:36 -0800 Subject: tcp: Fix use-after-free of nreq in reqsk_timer_handler(). The cited commit replaced inet_csk_reqsk_queue_drop_and_put() with __inet_csk_reqsk_queue_drop() and reqsk_put() in reqsk_timer_handler(). Then, oreq should be passed to reqsk_put() instead of req; otherwise use-after-free of nreq could happen when reqsk is migrated but the retry attempt failed (e.g. due to timeout). Let's pass oreq to reqsk_put(). Fixes: e8c526f2bdf1 ("tcp/dccp: Don't use timer_pending() in reqsk_queue_unlink().") Reported-by: Liu Jian Closes: https://lore.kernel.org/netdev/1284490f-9525-42ee-b7b8-ccadf6606f6d@huawei.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Vadim Fedorenko Reviewed-by: Liu Jian Reviewed-by: Eric Dumazet Reviewed-by: Martin KaFai Lau Link: https://patch.msgid.link/20241123174236.62438-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 491c2c6b683e..6872b5aff73e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1191,7 +1191,7 @@ no_ownership: drop: __inet_csk_reqsk_queue_drop(sk_listener, oreq, true); - reqsk_put(req); + reqsk_put(oreq); } static bool reqsk_queue_hash_req(struct request_sock *req, -- cgit v1.3 From 122aba8c80618eca904490b1733af27fb8f07528 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2024 18:21:48 -0800 Subject: net_sched: sch_fq: don't follow the fast path if Tx is behind now Recent kernels cause a lot of TCP retransmissions [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 2.24 GBytes 19.2 Gbits/sec 2767 442 KBytes [ 5] 1.00-2.00 sec 2.23 GBytes 19.1 Gbits/sec 2312 350 KBytes ^^^^ Replacing the qdisc with pfifo makes retransmissions go away. It appears that a flow may have a delayed packet with a very near Tx time. Later, we may get busy processing Rx and the target Tx time will pass, but we won't service Tx since the CPU is busy with Rx. If Rx sees an ACK and we try to push more data for the delayed flow we may fastpath the skb, not realizing that there are already "ready to send" packets for this flow sitting in the qdisc. Don't trust the fastpath if we are "behind" according to the projected Tx time for next flow waiting in the Qdisc. Because we consider anything within the offload window to be okay for fastpath we must consider the entire offload window as "now". Qdisc config: qdisc fq 8001: dev eth0 parent 1234:1 limit 10000p flow_limit 100p \ buckets 32768 orphan_mask 1023 bands 3 \ priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 \ weights 589824 196608 65536 quantum 3028b initial_quantum 15140b \ low_rate_threshold 550Kbit \ refill_delay 40ms timer_slack 10us horizon 10s horizon_drop For iperf this change seems to do fine, the reordering is gone. The fastpath still gets used most of the time: gc 0 highprio 0 fastpath 142614 throttled 418309 latency 19.1us xx_behind 2731 where "xx_behind" counts how many times we hit the new "return false". CC: stable@vger.kernel.org Fixes: 076433bd78d7 ("net_sched: sch_fq: add fast path for mostly idle qdisc") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241124022148.3126719-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/sched/sch_fq.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a97638bef6da..a5e87f9ea986 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -332,6 +332,12 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb, */ if (q->internal.qlen >= 8) return false; + + /* Ordering invariants fall apart if some delayed flows + * are ready but we haven't serviced them, yet. + */ + if (q->time_next_delayed_flow <= now + q->offload_horizon) + return false; } sk = skb->sk; -- cgit v1.3 From 11b6e701bce96f98474084f26821157cb0dccf69 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 24 Nov 2024 16:40:56 +0100 Subject: ipmr: add debug check for mr table cleanup The multicast route tables lifecycle, for both ipv4 and ipv6, is protected by RCU using the RTNL lock for write access. In many places a table pointer escapes the RCU (or RTNL) protected critical section, but such scenarios are actually safe because tables are deleted only at namespace cleanup time or just after allocation, in case of default rule creation failure. Tables freed at namespace cleanup time are assured to be alive for the whole netns lifetime; tables freed just after creation time are never exposed to other possible users. Ensure that the free conditions are respected in ip{,6}mr_free_table, to document the locking schema and to prevent future possible introduction of 'table del' operation from breaking it. Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- net/ipv4/ipmr.c | 14 ++++++++++++++ net/ipv6/ip6mr.c | 14 ++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c58dd78509a2..bac0776648e0 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -120,6 +120,11 @@ static void ipmr_expire_process(struct timer_list *t); lockdep_rtnl_is_held() || \ list_empty(&net->ipv4.mr_tables)) +static bool ipmr_can_free_table(struct net *net) +{ + return !check_net(net) || !net->ipv4.mr_rules_ops; +} + static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -302,6 +307,11 @@ EXPORT_SYMBOL(ipmr_rule_default); #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) +static bool ipmr_can_free_table(struct net *net) +{ + return !check_net(net); +} + static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -413,6 +423,10 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { + struct net *net = read_pnet(&mrt->net); + + DEBUG_NET_WARN_ON_ONCE(!ipmr_can_free_table(net)); + timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d66f58932a79..b80fca894916 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -108,6 +108,11 @@ static void ipmr_expire_process(struct timer_list *t); lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) +static bool ip6mr_can_free_table(struct net *net) +{ + return !check_net(net) || !net->ipv6.mr6_rules_ops; +} + static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -291,6 +296,11 @@ EXPORT_SYMBOL(ip6mr_rule_default); #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) +static bool ip6mr_can_free_table(struct net *net) +{ + return !check_net(net); +} + static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -392,6 +402,10 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr_table *mrt) { + struct net *net = read_pnet(&mrt->net); + + DEBUG_NET_WARN_ON_ONCE(!ip6mr_can_free_table(net)); + timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); -- cgit v1.3 From f1553c9894b4dbeb10a2ab15ab1aa113b3b4047c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 24 Nov 2024 16:40:57 +0100 Subject: ip6mr: fix tables suspicious RCU usage Several places call ip6mr_get_table() with no RCU nor RTNL lock. Add RCU protection inside such helper and provide a lockless variant for the few callers that already acquired the relevant lock. Note that some users additionally reference the table outside the RCU lock. That is actually safe as the table deletion can happen only after all table accesses are completed. Fixes: e2d57766e674 ("net: Provide compat support for SIOCGETMIFCNT_IN6 and SIOCGETSGCNT_IN6.") Fixes: d7c31cbde4bc ("net: ip6mr: add RTM_GETROUTE netlink op") Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- net/ipv6/ip6mr.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index b80fca894916..4147890fe98f 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -130,7 +130,7 @@ static struct mr_table *ip6mr_mr_table_iter(struct net *net, return ret; } -static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *__ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -141,6 +141,16 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id) return NULL; } +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ip6mr_get_table(net, id); + rcu_read_unlock(); + return mrt; +} + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { @@ -182,7 +192,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, arg->table = fib_rule_get_table(rule, arg); - mrt = ip6mr_get_table(rule->fr_net, arg->table); + mrt = __ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -314,6 +324,8 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id) return net->ipv6.mrt6; } +#define __ip6mr_get_table ip6mr_get_table + static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { @@ -392,7 +404,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; - mrt = ip6mr_get_table(net, id); + mrt = __ip6mr_get_table(net, id); if (mrt) return mrt; @@ -425,13 +437,15 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct mr_table *mrt; - mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (!mrt) + rcu_read_lock(); + mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); + if (!mrt) { + rcu_read_unlock(); return ERR_PTR(-ENOENT); + } iter->mrt = mrt; - rcu_read_lock(); return mr_vif_seq_start(seq, pos); } @@ -2292,11 +2306,13 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, struct mfc6_cache *cache; struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); - mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); - if (!mrt) + rcu_read_lock(); + mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT); + if (!mrt) { + rcu_read_unlock(); return -ENOENT; + } - rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); @@ -2576,7 +2592,7 @@ static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, grp = nla_get_in6_addr(tb[RTA_DST]); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); - mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); + mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; @@ -2623,7 +2639,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (filter.table_id) { struct mr_table *mrt; - mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); + mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; -- cgit v1.3 From fc9c273d6daaa9866f349bbe8cae25c67764c456 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 24 Nov 2024 16:40:58 +0100 Subject: ipmr: fix tables suspicious RCU usage Similar to the previous patch, plumb the RCU lock inside the ipmr_get_table(), provided a lockless variant and apply the latter in the few spots were the lock is already held. Fixes: 709b46e8d90b ("net: Add compat ioctl support for the ipv4 multicast ioctl SIOCGETSGCNT") Fixes: f0ad0860d01e ("ipv4: ipmr: support multiple tables") Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- net/ipv4/ipmr.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bac0776648e0..383ea8b91cc7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -142,7 +142,7 @@ static struct mr_table *ipmr_mr_table_iter(struct net *net, return ret; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) +static struct mr_table *__ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -153,6 +153,16 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return NULL; } +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ipmr_get_table(net, id); + rcu_read_unlock(); + return mrt; +} + static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { @@ -194,7 +204,7 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp, arg->table = fib_rule_get_table(rule, arg); - mrt = ipmr_get_table(rule->fr_net, arg->table); + mrt = __ipmr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; @@ -325,6 +335,8 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return net->ipv4.mrt; } +#define __ipmr_get_table ipmr_get_table + static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { @@ -413,7 +425,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) if (id != RT_TABLE_DEFAULT && id >= 1000000000) return ERR_PTR(-EINVAL); - mrt = ipmr_get_table(net, id); + mrt = __ipmr_get_table(net, id); if (mrt) return mrt; @@ -1388,7 +1400,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, goto out_unlock; } - mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); + mrt = __ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); if (!mrt) { ret = -ENOENT; goto out_unlock; @@ -2276,11 +2288,13 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, struct mr_table *mrt; int err; - mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); - if (!mrt) + rcu_read_lock(); + mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT); + if (!mrt) { + rcu_read_unlock(); return -ENOENT; + } - rcu_read_lock(); cache = ipmr_cache_find(mrt, saddr, daddr); if (!cache && skb->dev) { int vif = ipmr_find_vif(mrt, skb->dev); @@ -2564,7 +2578,7 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, grp = nla_get_in_addr_default(tb[RTA_DST], 0); tableid = nla_get_u32_default(tb[RTA_TABLE], 0); - mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); + mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); if (!mrt) { err = -ENOENT; goto errout_free; @@ -2618,7 +2632,7 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (filter.table_id) { struct mr_table *mrt; - mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id); + mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) return skb->len; @@ -2726,7 +2740,7 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, break; } } - mrt = ipmr_get_table(net, tblid); + mrt = __ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; goto out; @@ -2934,13 +2948,15 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) struct net *net = seq_file_net(seq); struct mr_table *mrt; - mrt = ipmr_get_table(net, RT_TABLE_DEFAULT); - if (!mrt) + rcu_read_lock(); + mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT); + if (!mrt) { + rcu_read_unlock(); return ERR_PTR(-ENOENT); + } iter->mrt = mrt; - rcu_read_lock(); return mr_vif_seq_start(seq, pos); } -- cgit v1.3 From f6d7695b5ae22092fa2cc42529bb7462f7e0c4ad Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 28 Nov 2024 17:18:04 +0100 Subject: ipmr: fix build with clang and DEBUG_NET disabled. Sasha reported a build issue in ipmr:: net/ipv4/ipmr.c:320:13: error: function 'ipmr_can_free_table' is not \ needed and will not be emitted \ [-Werror,-Wunneeded-internal-declaration] 320 | static bool ipmr_can_free_table(struct net *net) Apparently clang is too smart with BUILD_BUG_ON_INVALID(), let's fallback to a plain WARN_ON_ONCE(). Reported-by: Sasha Levin Closes: https://qa-reports.linaro.org/lkft/sashal-linus-next/build/v6.11-25635-g6813e2326f1e/testrun/26111580/suite/build/test/clang-nightly-lkftconfig/details/ Fixes: 11b6e701bce9 ("ipmr: add debug check for mr table cleanup") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/ee75faa926b2446b8302ee5fc30e129d2df73b90.1732810228.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- net/ipv4/ipmr.c | 2 +- net/ipv6/ip6mr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 383ea8b91cc7..c5b8ec5c0a8c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -437,7 +437,7 @@ static void ipmr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); - DEBUG_NET_WARN_ON_ONCE(!ipmr_can_free_table(net)); + WARN_ON_ONCE(!ipmr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 4147890fe98f..7f1902ac3586 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -416,7 +416,7 @@ static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); - DEBUG_NET_WARN_ON_ONCE(!ip6mr_can_free_table(net)); + WARN_ON_ONCE(!ip6mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | -- cgit v1.3 From 4db9ad82a6c823094da27de4825af693a3475d51 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Fri, 15 Nov 2024 17:38:04 +0800 Subject: sunrpc: clear XPRT_SOCK_UPD_TIMEOUT when reset transport Since transport->sock has been set to NULL during reset transport, XPRT_SOCK_UPD_TIMEOUT also needs to be cleared. Otherwise, the xs_tcp_set_socket_timeouts() may be triggered in xs_tcp_send_request() to dereference the transport->sock that has been set to NULL. Fixes: 7196dbb02ea0 ("SUNRPC: Allow changing of the TCP timeout parameters on the fly") Signed-off-by: Li Lingfeng Signed-off-by: Liu Jian Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d587c261d999..31bc046e2850 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1198,6 +1198,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); + clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) -- cgit v1.3 From d7bdd849ef1b681da03ac05ca0957b2cbe2d24b6 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 15 Nov 2024 08:59:36 -0500 Subject: SUNRPC: timeout and cancel TLS handshake with -ETIMEDOUT We've noticed a situation where an unstable TCP connection can cause the TLS handshake to timeout waiting for userspace to complete it. When this happens, we don't want to return from xs_tls_handshake_sync() with zero, as this will cause the upper xprt to be set CONNECTED, and subsequent attempts to transmit will be returned with -EPIPE. The sunrpc machine does not recover from this situation and will spin attempting to transmit. The return value of tls_handshake_cancel() can be used to detect a race with completion: * tls_handshake_cancel - cancel a pending handshake * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found If true, we do not want the upper xprt to be connected, so return -ETIMEDOUT. If false, its possible the handshake request was lost and that may be the reason for our timeout. Again we do not want the upper xprt to be connected, so return -ETIMEDOUT. Ensure that we alway return an error from xs_tls_handshake_sync() if we call tls_handshake_cancel(). Signed-off-by: Benjamin Coddington Reviewed-by: Chuck Lever Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class") Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 31bc046e2850..fecc8b8fa266 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2616,11 +2616,10 @@ static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_par rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done, XS_TLS_HANDSHAKE_TO); if (rc <= 0) { - if (!tls_handshake_cancel(sk)) { - if (rc == 0) - rc = -ETIMEDOUT; - goto out_put_xprt; - } + tls_handshake_cancel(sk); + if (rc == 0) + rc = -ETIMEDOUT; + goto out_put_xprt; } rc = lower_transport->xprt_err; -- cgit v1.3 From 3f23f96528e8fcf8619895c4c916c52653892ec1 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 12 Nov 2024 21:54:34 +0800 Subject: sunrpc: fix one UAF issue caused by sunrpc kernel tcp socket BUG: KASAN: slab-use-after-free in tcp_write_timer_handler+0x156/0x3e0 Read of size 1 at addr ffff888111f322cd by task swapper/0/0 CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.0-rc4-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 Call Trace: dump_stack_lvl+0x68/0xa0 print_address_description.constprop.0+0x2c/0x3d0 print_report+0xb4/0x270 kasan_report+0xbd/0xf0 tcp_write_timer_handler+0x156/0x3e0 tcp_write_timer+0x66/0x170 call_timer_fn+0xfb/0x1d0 __run_timers+0x3f8/0x480 run_timer_softirq+0x9b/0x100 handle_softirqs+0x153/0x390 __irq_exit_rcu+0x103/0x120 irq_exit_rcu+0xe/0x20 sysvec_apic_timer_interrupt+0x76/0x90 asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:default_idle+0xf/0x20 Code: 4c 01 c7 4c 29 c2 e9 72 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 33 f8 25 00 fb f4 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 RSP: 0018:ffffffffa2007e28 EFLAGS: 00000242 RAX: 00000000000f3b31 RBX: 1ffffffff4400fc7 RCX: ffffffffa09c3196 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9f00590f RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed102360835d R10: ffff88811b041aeb R11: 0000000000000001 R12: 0000000000000000 R13: ffffffffa202d7c0 R14: 0000000000000000 R15: 00000000000147d0 default_idle_call+0x6b/0xa0 cpuidle_idle_call+0x1af/0x1f0 do_idle+0xbc/0x130 cpu_startup_entry+0x33/0x40 rest_init+0x11f/0x210 start_kernel+0x39a/0x420 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0x97/0xa0 common_startup_64+0x13e/0x141 Allocated by task 595: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 __kasan_slab_alloc+0x87/0x90 kmem_cache_alloc_noprof+0x12b/0x3f0 copy_net_ns+0x94/0x380 create_new_namespaces+0x24c/0x500 unshare_nsproxy_namespaces+0x75/0xf0 ksys_unshare+0x24e/0x4f0 __x64_sys_unshare+0x1f/0x30 do_syscall_64+0x70/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 100: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x54/0x70 kmem_cache_free+0x156/0x5d0 cleanup_net+0x5d3/0x670 process_one_work+0x776/0xa90 worker_thread+0x2e2/0x560 kthread+0x1a8/0x1f0 ret_from_fork+0x34/0x60 ret_from_fork_asm+0x1a/0x30 Reproduction script: mkdir -p /mnt/nfsshare mkdir -p /mnt/nfs/netns_1 mkfs.ext4 /dev/sdb mount /dev/sdb /mnt/nfsshare systemctl restart nfs-server chmod 777 /mnt/nfsshare exportfs -i -o rw,no_root_squash *:/mnt/nfsshare ip netns add netns_1 ip link add name veth_1_peer type veth peer veth_1 ifconfig veth_1_peer 11.11.0.254 up ip link set veth_1 netns netns_1 ip netns exec netns_1 ifconfig veth_1 11.11.0.1 ip netns exec netns_1 /root/iptables -A OUTPUT -d 11.11.0.254 -p tcp \ --tcp-flags FIN FIN -j DROP (note: In my environment, a DESTROY_CLIENTID operation is always sent immediately, breaking the nfs tcp connection.) ip netns exec netns_1 timeout -s 9 300 mount -t nfs -o proto=tcp,vers=4.1 \ 11.11.0.254:/mnt/nfsshare /mnt/nfs/netns_1 ip netns del netns_1 The reason here is that the tcp socket in netns_1 (nfs side) has been shutdown and closed (done in xs_destroy), but the FIN message (with ack) is discarded, and the nfsd side keeps sending retransmission messages. As a result, when the tcp sock in netns_1 processes the received message, it sends the message (FIN message) in the sending queue, and the tcp timer is re-established. When the network namespace is deleted, the net structure accessed by tcp's timer handler function causes problems. To fix this problem, let's hold netns refcnt for the tcp kernel socket as done in other modules. This is an ugly hack which can easily be backported to earlier kernels. A proper fix which cleans up the interfaces will follow, but may not be so easy to backport. Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Signed-off-by: Liu Jian Acked-by: Jeff Layton Reviewed-by: Kuniyuki Iwashima Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 4 ++++ net/sunrpc/xprtsock.c | 7 +++++++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 825ec5357691..59e2c46240f5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1551,6 +1551,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { + __netns_tracker_free(net, &sock->sk->ns_tracker, false); + sock->sk->sk_net_refcnt = 1; + get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fecc8b8fa266..c60936d8cef7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1941,6 +1941,13 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } + if (protocol == IPPROTO_TCP) { + __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); + sock->sk->sk_net_refcnt = 1; + get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(xprt->xprt_net, 1); + } + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) return ERR_CAST(filp); -- cgit v1.3